Example #1
def parseStream(content, uniqueId=""):
    tokens = []
    line = column = sol = 1
    scanner = Scanner.LQueue(tokens_2_obj(content))
    scanner.content = content
    scanner.slice = scanner_slice
    for tok in scanner:
        # some initial values (tok is an instance of Scanner.Token)
        token = {
            "source" : tok.value, 
            "detail" : "",
            "line"   : line, 
            "column" : tok.spos - sol + 1, 
            "id"     : uniqueId
            }

        # white space
        if (tok.name == 'white'):
            continue

        # end of file
        elif tok.name == 'eof':
            token['type'] = 'eof'
        
        # line break
        elif tok.name == 'nl':
            token['type']   = 'eol'
            token['source'] = ''    # that's the way the old tokenizer does it
            line += 1                  # increase line count
            sol  = tok.spos + tok.len  # char pos of next line start
        
        # float
        elif tok.name == 'float':
            token['type'] = 'number'
            token['detail'] = 'float'
        
        # hex integer
        elif tok.name == 'hexnum':
            token['type'] = 'number'
            token['detail'] = 'int'
        
        # integer
        elif tok.name == 'number':
            token['type'] = 'number'
            token['detail'] = 'int'
        
        # string
        elif tok.value in ('"', "'"):
            # accumulate strings
            token['type'] = 'string'
            if tok.value == '"':
                token['detail'] = 'doublequotes'
            else:
                token['detail'] = 'singlequotes'
            try:
                token['source'] = parseString(scanner, tok.value)
            except SyntaxException, e:
                desc = e.args[0] + " starting with %r..." % (tok.value + e.args[1])[:20]
                raiseSyntaxException(token, desc)
            token['source'] = token['source'][:-1]
            # adapt line number -- this assumes multi-line strings are not generally out
            linecnt = len(re.findall("\n", token['source']))
            if linecnt > 0:
                line += linecnt

        # identifier, operator
        elif tok.name in ("ident", "op", "mulop"):

            # JS operator symbols
            if tok.value in lang.TOKENS:
                # division, div-assignment, regexp
                if tok.value in ('/', '/='):
                    # accumulate regex literals
                    if (len(tokens) == 0 or (
                            (tokens[-1]['type']   != 'number') and
                            (tokens[-1]['detail'] != 'RP')     and
                            (tokens[-1]['detail'] != 'RB')     and
                            (tokens[-1]['type']   != 'name'))):
                        regexp = parseRegexp(scanner)
                        token['type'] = 'regexp'
                        token['source'] = tok.value + regexp
                    else:
                        token['type'] = 'token'
                        token['detail'] = lang.TOKENS[tok.value]

                # comment, inline
                elif tok.value == '//':
                    # accumulate inline comments
                    if (len(tokens) == 0 or
                        not is_last_escaped_token(tokens)):
                        commnt = parseCommentI(scanner)
                        token['type'] = 'comment'
                        token['source'] = tok.value + commnt
                        token['begin'] = not hasLeadingContent(tokens)
                        token['end'] = True
                        token['connection'] = "before" if token['begin'] else "after"  # "^//...\n i=1;" => comment *before* code; "i=1; //..." => comment *after* code
                        token['multiline'] = False
                        token['detail'] = 'inline'
                    else:
                        print >> sys.stderr, "Inline comment out of context"
                
                # comment, multiline
                elif tok.value == '/*':
                    # accumulate multiline comments
                    if (len(tokens) == 0 or
                        not is_last_escaped_token(tokens)):
                        token['type'] = 'comment'
                        try:
                            commnt = parseCommentM(scanner)
                        except SyntaxException, e:
                            desc = e.args[0] + " starting with \"%r...\"" % (tok.value + e.args[1])[:20]
                            raiseSyntaxException(token, desc)
                        token['source'] = tok.value + commnt
                        token['detail'] = comment.getFormat(token['source'])
                        token['begin'] = not hasLeadingContent(tokens)
                        if restLineIsEmpty(scanner):
                            token['end'] = True
                        else:
                            token['end'] = False
                        if token['begin']:
                            token['source'] = comment.outdent(token['source'], column - 1)
                        token['source'] = comment.correct(token['source'])
                        if token['end'] and not token['begin']:
                            token['connection'] = "after"
                        else:
                            token['connection'] = "before"
                        # adapt line number
                        linecnt = len(re.findall("\n", token['source']))
                        if linecnt > 0:
                            line += linecnt
                            token['multiline'] = True
                        else:
                            token['multiline'] = False

                    else:
                        print >> sys.stderr, "Multiline comment out of context"
                                
                # every other operator goes as is
                else:
                    token['type'] = 'token'
                    token['detail'] = lang.TOKENS[tok.value]
Example #2
def parseStream(content, uniqueId=""):
    # make global variables available
    global parseLine
    global parseColumn
    global parseUniqueId

    # reset global stuff
    parseColumn = 1
    parseLine = 1
    parseUniqueId = uniqueId

    # prepare storage
    tokens = []
    content = protectEscape(content)

    # print "      * searching for patterns..."
    try:
        all = R_ALL.findall(content)
    except RuntimeError:
        print "Could not parse file %s" % uniqueId
        print "Generally this means that there is a syntactial problem with your source-code."
        print "Please omit the usage of nested comments like '/* foo /* bar */'."
        sys.exit(1)

    # print "      * structuring..."

    # for item in all:
    #    if type(item) != types.TupleType:   # item's no longer a tuple!
    #        item = (item,)
    #    fragment = item[0]

    while content:
        mo = R_ALL.search(content)
        if mo:
            fragment = mo.group(0)
        else:
            break

        # print "Found: '%s'" % fragment

        # Handle block comment
        if comment.R_BLOCK_COMMENT.match(fragment):
            source = recoverEscape(fragment)
            format = comment.getFormat(source)
            multiline = comment.isMultiLine(source)

            # print "Type:MultiComment"
            content = parseFragmentLead(content, fragment, tokens)  # sort of intelligent "pop"

            atBegin = not hasLeadingContent(tokens)
            if re.compile("^\s*\n").search(content):
                atEnd = True
            else:
                atEnd = False

            # print "Begin: %s, End: %s" % (atBegin, atEnd)

            # Fixing source content
            if atBegin:
                source = comment.outdent(source, parseColumn - 1)

            source = comment.correct(source)

            connection = "before"

            if atEnd and not atBegin:
                connection = "after"
            else:
                connection = "before"

            tokens.append(
                {
                    "type": "comment",
                    "detail": format,
                    "multiline": multiline,
                    "connection": connection,
                    "source": source,
                    "id": parseUniqueId,
                    "line": parseLine,
                    "column": parseColumn,
                    "begin": atBegin,
                    "end": atEnd,
                }
            )
            parseLine += len(fragment.split("\n")) - 1

        # Handle inline comment
        elif comment.R_INLINE_COMMENT.match(fragment):
            # print "Type:SingleComment"
            source = recoverEscape(fragment)
            content = parseFragmentLead(content, fragment, tokens)

            atBegin = hasLeadingContent(tokens)
            atEnd = True

            if atBegin:
                connection = "after"
            else:
                connection = "before"

            source = comment.correct(source)

            tokens.append(
                {
                    "type": "comment",
                    "detail": "inline",
                    "multiline": False,
                    "connection": connection,
                    "source": source,
                    "id": parseUniqueId,
                    "line": parseLine,
                    "column": parseColumn,
                    "begin": atBegin,
                    "end": atEnd,
                }
            )

        # Handle string
        elif R_STRING_A.match(fragment):
            # print "Type:StringA: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            source = recoverEscape(fragment)[1:-1]
            tokens.append(
                {
                    "type": "string",
                    "detail": "singlequotes",
                    "source": source.replace("\\\n", ""),
                    "id": parseUniqueId,
                    "line": parseLine,
                    "column": parseColumn,
                }
            )
            newLines = source.count("\\\n")
            parseLine += newLines
            if newLines:
                parseColumn = len(source) - source.rfind("\\\n") + 2
            else:
                parseColumn += len(source) + 2

        # Handle string
        elif R_STRING_B.match(fragment):
            # print "Type:StringB: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            source = recoverEscape(fragment)[1:-1]
            tokens.append(
                {
                    "type": "string",
                    "detail": "doublequotes",
                    "source": source.replace("\\\n", ""),
                    "id": parseUniqueId,
                    "line": parseLine,
                    "column": parseColumn,
                }
            )
            newLines = source.count("\\\n")
            parseLine += newLines
            if newLines:
                parseColumn = len(source) - source.rfind("\\\n") + 2
            else:
                parseColumn += len(source) + 2

        # Handle float num
        elif R_FLOAT.match(fragment):
            # print "Type:Float: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            tokens.append(
                {
                    "type": "number",
                    "detail": "float",
                    "source": fragment,
                    "id": parseUniqueId,
                    "line": parseLine,
                    "column": parseColumn,
                }
            )

        # Handle regexps
        # elif R_REGEXP.search(content[:content.index('\n')]):
        #    mo = R_REGEXP.search(content)
        #    regmatch = mo.group(0)
        #    content = parseFragmentLead(content, regmatch, tokens)
        #    tokens.append({ "type" : "regexp", "detail" : "", "source" : recoverEscape(regmatch), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
        #    parseColumn += len(regmatch)

        # Handle operator
        elif R_OPERATORS.match(fragment):
            # print "Type:Operator: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            tokens.append(
                {
                    "type": "token",
                    "detail": lang.TOKENS[fragment],
                    "source": fragment,
                    "id": parseUniqueId,
                    "line": parseLine,
                    "column": parseColumn,
                }
            )

        # Handle everything else
        else:
            fragresult = R_REGEXP.search(fragment)

            if fragresult:
                # print "Type:RegExp: %s" % fragresult.group(0)

                if (
                    R_REGEXP_A.match(fragment)
                    or R_REGEXP_B.match(fragment)
                    or R_REGEXP_C.match(fragment)
                    or R_REGEXP_D.match(fragment)
                    or R_REGEXP_E.match(fragment)
                ):
                    content = parseFragmentLead(content, fragresult.group(0), tokens)
                    tokens.append(
                        {
                            "type": "regexp",
                            "detail": "",
                            "source": recoverEscape(fragresult.group(0)),
                            "id": parseUniqueId,
                            "line": parseLine,
                            "column": parseColumn,
                        }
                    )

                else:
                    print "Bad regular expression: %s" % fragresult.group(0)

            else:
                print "Type:None!"

    # tokens.extend(parsePart(recoverEscape(content)))
    parsePart(recoverEscape(content), tokens)
    tokens.append(
        {"type": "eof", "source": "", "detail": "", "id": parseUniqueId, "line": parseLine, "column": parseColumn}
    )

    return tokens
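
This regex-driven variant builds the same flat list of token dictionaries as the scanner-based examples. As a rough sketch of the output shape (the input fragment, the "demo.js" id, and the position values below are illustrative assumptions, not taken from any of the listed projects), a float literal such as 3.14 would come through the R_FLOAT branch roughly as:

# Illustrative only: roughly the dict the R_FLOAT branch above appends for a
# fragment like "3.14" in a file registered under the hypothetical id "demo.js".
float_token = {
    "type": "number",    # all numeric literals are typed "number"
    "detail": "float",   # "detail" distinguishes float from int literals
    "source": "3.14",    # the raw matched fragment
    "id": "demo.js",     # the uniqueId passed to parseStream
    "line": 1,           # illustrative position values
    "column": 9,
}
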
Example #3
def parseStream(content, uniqueId=""):
    tokens = []
    line = column = sol = 1
    scanner = Scanner.LQueue(tokens_2_obj(content))
    for tok in scanner:
        # tok is an instance of Scanner.Token
        token = {"source": tok.value, "detail": "", "line": line, "column": tok.spos - sol + 1, "id": uniqueId}

        # white space
        if tok.name == "white":
            continue

        # end of file
        elif tok.name == "eof":
            token["type"] = "eof"

        # line break
        elif tok.name == "nl":
            token["type"] = "eol"
            token["source"] = ""  # that's the way the old tokenizer does it
            line += 1  # increase line count
            sol = tok.spos + tok.len  # char pos of next line start

        # float
        elif tok.name == "float":
            token["type"] = "number"
            token["detail"] = "float"

        # hex integer
        elif tok.name == "hexnum":
            token["type"] = "number"
            token["detail"] = "int"

        # integer
        elif tok.name == "number":
            token["type"] = "number"
            token["detail"] = "int"

        # string
        elif tok.value in ('"', "'"):
            # accumulate strings
            token["type"] = "string"
            if tok.value == '"':
                token["detail"] = "doublequotes"
            else:
                token["detail"] = "singlequotes"
            token["source"] = parseString(scanner, tok.value)
            token["source"] = token["source"][:-1]

        # identifier, operator
        elif tok.name in ("ident", "op", "mulop"):

            # JS operator symbols
            if tok.value in lang.TOKENS:
                # division, div-assignment, regexp
                if tok.value in ("/", "/="):
                    # accumulate regex literals
                    if len(tokens) == 0 or (
                        (tokens[-1]["type"] != "number")
                        and (tokens[-1]["detail"] != "RP")
                        and (tokens[-1]["detail"] != "RB")
                        and (tokens[-1]["type"] != "name")
                    ):
                        regexp = parseRegexp(scanner)
                        token["type"] = "regexp"
                        token["source"] = tok.value + regexp
                    else:
                        token["type"] = "token"
                        token["detail"] = lang.TOKENS[tok.value]

                # comment, inline
                elif tok.value == "//":
                    # accumulate inline comments
                    if len(tokens) == 0 or not is_last_escaped_token(tokens):
                        commnt = parseCommentI(scanner)
                        token["type"] = "comment"
                        token["source"] = tok.value + commnt
                        token["begin"] = not hasLeadingContent(tokens)
                        token["end"] = True
                        token["connection"] = (
                            "before" if token["begin"] else "after"
                        )  # "^//...\n i=1;" => comment *before* code; "i=1; //..." => comment *after* code
                        token["multiline"] = False
                        token["detail"] = "inline"
                    else:
                        print >>sys.stderr, "Inline comment out of context"

                # comment, multiline
                elif tok.value == "/*":
                    # accumulate multiline comments
                    if len(tokens) == 0 or not is_last_escaped_token(tokens):
                        commnt = parseCommentM(scanner)
                        token["type"] = "comment"
                        token["source"] = tok.value + commnt
                        token["detail"] = comment.getFormat(token["source"])
                        token["begin"] = not hasLeadingContent(tokens)
                        if restLineIsEmpty(scanner):
                            token["end"] = True
                        else:
                            token["end"] = False
                        if token["begin"]:
                            token["source"] = comment.outdent(token["source"], column - 1)
                        token["source"] = comment.correct(token["source"])
                        if token["end"] and not token["begin"]:
                            token["connection"] = "after"
                        else:
                            token["connection"] = "before"
                        # adapt line number
                        linecnt = len(re.findall("\n", token["source"]))
                        if linecnt > 0:
                            line += linecnt
                            token["multiline"] = True
                        else:
                            token["multiline"] = False

                    else:
                        print >>sys.stderr, "Multiline comment out of context"

                # every other operator goes as is
                else:
                    token["type"] = "token"
                    token["detail"] = lang.TOKENS[tok.value]

            # JS keywords
            elif tok.value in lang.RESERVED:
                token["type"] = "reserved"
                token["detail"] = lang.RESERVED[tok.value]

            # JS/BOM objects
            elif tok.value in lang.BUILTIN:
                token["type"] = "builtin"

            # identifier
            elif tok.value.startswith("__"):
                token["type"] = "name"
                token["detail"] = "private"
            elif tok.value.startswith("_"):
                token["type"] = "name"
                token["detail"] = "protected"
            else:
                token["type"] = "name"
                token["detail"] = "public"

        # unknown token
        else:
            print >>sys.stderr, "Unhandled lexem: %s" % tok
            pass

        tokens.append(token)
    return tokens
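
Each variant returns the token list directly, so callers can post-process it like any other sequence of dicts. The snippet below is a hypothetical usage sketch (the input string, the "demo.js" id, and the assumption that parseStream and its helper modules are importable are not part of the listing); it collects every comment token together with its source position:

# Hypothetical usage sketch -- assumes parseStream and the modules it relies on
# (Scanner, lang, comment, ...) are importable from the tokenizer module.
js_source = 'var answer = 42; // the answer\n'

tokens = parseStream(js_source, uniqueId="demo.js")

# Pick out every comment token together with its position in the source.
comments = [(tok["line"], tok["column"], tok["source"])
            for tok in tokens
            if tok["type"] == "comment"]

for line, column, text in comments:
    print("comment at %d:%d -> %r" % (line, column, text))
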
Example #4
def parseStream(content, uniqueId=""):
    tokens = []
    line = column = 1
    sol = 0  # index of start-of-line
    scanner = Scanner.LQueue(tokens_2_obj(content))
    scanner.content = content
    scanner.slice = scanner_slice
    for tok in scanner:
        # some initial values (tok is an instance of Scanner.Token)
        token = {"source": tok.value, "detail": "", "line": line, "column": tok.spos - sol + 1, "id": uniqueId}

        # white space
        if tok.name == "white":
            continue

        # end of file
        elif tok.name == "eof":
            token["type"] = "eof"

        # line break
        elif tok.name == "nl":
            token["type"] = "eol"
            token["source"] = ""  # that's the way the old tokenizer does it
            line += 1  # increase line count
            sol = tok.spos + tok.len  # char pos of next line start

        # float
        elif tok.name == "float":
            token["type"] = "number"
            token["detail"] = "float"

        # hex integer
        elif tok.name == "hexnum":
            token["type"] = "number"
            token["detail"] = "int"

        # integer
        elif tok.name == "number":
            token["type"] = "number"
            token["detail"] = "int"

        # string
        elif tok.value in ('"', "'"):
            # accumulate strings
            token["type"] = "string"
            if tok.value == '"':
                token["detail"] = "doublequotes"
            else:
                token["detail"] = "singlequotes"
            try:
                token["source"] = parseString(scanner, tok.value)
            except SyntaxException, e:
                desc = e.args[0] + " starting with %r..." % (tok.value + e.args[1])[:20]
                raiseSyntaxException(token, desc)
            token["source"] = token["source"][:-1]
            # adapt line number -- this assumes multi-line strings are not generally out
            linecnt = len(re.findall("\n", token["source"]))
            if linecnt > 0:
                line += linecnt

        # identifier, operator
        elif tok.name in ("ident", "op", "mulop"):

            # JS operator symbols
            if tok.value in lang.TOKENS:
                # division, div-assignment, regexp
                if tok.value in ("/", "/="):
                    # accumulate regex literals
                    if len(tokens) == 0 or (
                        (tokens[-1]["type"] != "number")
                        and (tokens[-1]["detail"] != "RP")
                        and (tokens[-1]["detail"] != "RB")
                        and (tokens[-1]["type"] != "name")
                    ):
                        regexp = parseRegexp(scanner)
                        token["type"] = "regexp"
                        token["source"] = tok.value + regexp
                    else:
                        token["type"] = "token"
                        token["detail"] = lang.TOKENS[tok.value]

                # comment, inline
                elif tok.value == "//":
                    # accumulate inline comments
                    if len(tokens) == 0 or not is_last_escaped_token(tokens):
                        commnt = parseCommentI(scanner)
                        token["type"] = "comment"
                        token["source"] = tok.value + commnt
                        token["begin"] = not hasLeadingContent(tokens)
                        token["end"] = True
                        token["connection"] = (
                            "before" if token["begin"] else "after"
                        )  # "^//...\n i=1;" => comment *before* code; "i=1; //..." => comment *after* code
                        token["multiline"] = False
                        token["detail"] = "inline"
                    else:
                        print >>sys.stderr, "Inline comment out of context"

                # comment, multiline
                elif tok.value == "/*":
                    # accumulate multiline comments
                    if len(tokens) == 0 or not is_last_escaped_token(tokens):
                        token["type"] = "comment"
                        try:
                            commnt = parseCommentM(scanner)
                        except SyntaxException, e:
                            desc = e.args[0] + ' starting with "%r..."' % (tok.value + e.args[1])[:20]
                            raiseSyntaxException(token, desc)
                        commnt = alignMultiLines(commnt, token["column"])
                        token["source"] = tok.value + commnt
                        token["detail"] = comment.getFormat(token["source"])
                        token["begin"] = not hasLeadingContent(tokens)
                        if restLineIsEmpty(scanner):
                            token["end"] = True
                        else:
                            token["end"] = False
                        if token["begin"]:
                            token["source"] = comment.outdent(token["source"], column - 1)
                        token["source"] = comment.correct(token["source"])
                        if token["end"] and not token["begin"]:
                            token["connection"] = "after"
                        else:
                            token["connection"] = "before"
                        # adapt line number
                        linecnt = len(re.findall("\n", token["source"]))
                        if linecnt > 0:
                            line += linecnt
                            token["multiline"] = True
                        else:
                            token["multiline"] = False

                    else:
                        print >>sys.stderr, "Multiline comment out of context"

                # every other operator goes as is
                else:
                    token["type"] = "token"
                    token["detail"] = lang.TOKENS[tok.value]
Example #5
def parseStream(content, uniqueId=""):
    tokens = []
    line = column = 1
    sol = 0  # index of start-of-line
    scanner = Scanner.LQueue(tokens_2_obj(content))
    scanner.content = content
    scanner.slice = scanner_slice
    for tok in scanner:
        # some initial values (tok is an instance of Scanner.Token)
        token = {
            "source": tok.value,
            "detail": "",
            "line": line,
            "column": tok.spos - sol + 1,
            "id": uniqueId
        }

        # white space
        if (tok.name == 'white'):
            continue

        # end of file
        elif tok.name == 'eof':
            token['type'] = 'eof'

        # line break
        elif tok.name == 'nl':
            token['type'] = 'eol'
            token['source'] = ''  # that's the way the old tokenizer does it
            line += 1  # increase line count
            sol = tok.spos + tok.len  # char pos of next line start

        # float
        elif tok.name == 'float':
            token['type'] = 'number'
            token['detail'] = 'float'

        # hex integer
        elif tok.name == 'hexnum':
            token['type'] = 'number'
            token['detail'] = 'int'

        # integer
        elif tok.name == 'number':
            token['type'] = 'number'
            token['detail'] = 'int'

        # string
        elif tok.value in ('"', "'"):
            # accumulate strings
            token['type'] = 'string'
            if tok.value == '"':
                token['detail'] = 'doublequotes'
            else:
                token['detail'] = 'singlequotes'
            try:
                token['source'] = parseString(scanner, tok.value)
            except SyntaxException, e:
                desc = e.args[0] + " starting with %r..." % (tok.value +
                                                             e.args[1])[:20]
                raiseSyntaxException(token, desc)
            token['source'] = token['source'][:-1]
            # adapt line number -- this assumes multi-line strings are not generally out
            linecnt = len(re.findall("\n", token['source']))
            if linecnt > 0:
                line += linecnt

        # identifier, operator
        elif tok.name in ("ident", "op", "mulop"):

            # JS operator symbols
            if tok.value in lang.TOKENS:
                # division, div-assignment, regexp
                if tok.value in ('/', '/='):
                    # accumulate regex literals
                    if (len(tokens) == 0
                            or ((tokens[-1]['type'] != 'number') and
                                (tokens[-1]['detail'] != 'RP') and
                                (tokens[-1]['detail'] != 'RB') and
                                (tokens[-1]['type'] != 'name'))):
                        regexp = parseRegexp(scanner)
                        token['type'] = 'regexp'
                        token['source'] = tok.value + regexp
                    else:
                        token['type'] = 'token'
                        token['detail'] = lang.TOKENS[tok.value]

                # comment, inline
                elif tok.value == '//':
                    # accumulate inline comments
                    if (len(tokens) == 0 or not is_last_escaped_token(tokens)):
                        commnt = parseCommentI(scanner)
                        token['type'] = 'comment'
                        token['source'] = tok.value + commnt
                        token['begin'] = not hasLeadingContent(tokens)
                        token['end'] = True
                        token['connection'] = "before" if token[
                            'begin'] else "after"  # "^//...\n i=1;" => comment *before* code; "i=1; //..." => comment *after* code
                        token['multiline'] = False
                        token['detail'] = 'inline'
                    else:
                        print >> sys.stderr, "Inline comment out of context"

                # comment, multiline
                elif tok.value == '/*':
                    # accumulate multiline comments
                    if (len(tokens) == 0 or not is_last_escaped_token(tokens)):
                        token['type'] = 'comment'
                        try:
                            commnt = parseCommentM(scanner)
                        except SyntaxException, e:
                            desc = e.args[0] + " starting with \"%r...\"" % (
                                tok.value + e.args[1])[:20]
                            raiseSyntaxException(token, desc)
                        commnt = alignMultiLines(commnt, token['column'])
                        token['source'] = tok.value + commnt
                        token['detail'] = comment.getFormat(token['source'])
                        token['begin'] = not hasLeadingContent(tokens)
                        if restLineIsEmpty(scanner):
                            token['end'] = True
                        else:
                            token['end'] = False
                        if token['begin']:
                            token['source'] = comment.outdent(
                                token['source'], column - 1)
                        token['source'] = comment.correct(token['source'])
                        if token['end'] and not token['begin']:
                            token['connection'] = "after"
                        else:
                            token['connection'] = "before"
                        # adapt line number
                        linecnt = len(re.findall("\n", token['source']))
                        if linecnt > 0:
                            line += linecnt
                            token['multiline'] = True
                        else:
                            token['multiline'] = False

                    else:
                        print >> sys.stderr, "Multiline comment out of context"

                # every other operator goes as is
                else:
                    token['type'] = 'token'
                    token['detail'] = lang.TOKENS[tok.value]
Example #6
def parseStream(content, uniqueId=""):
    # make global variables available
    global parseLine
    global parseColumn
    global parseUniqueId

    # reset global stuff
    parseColumn = 1
    parseLine = 1
    parseUniqueId = uniqueId

    # prepare storage
    tokens = []
    content = protectEscape(content)

    # print "      * searching for patterns..."
    try:
        all = R_ALL.findall(content)
    except RuntimeError:
        print "Could not parse file %s" % uniqueId
        print "Generally this means that there is a syntactial problem with your source-code."
        print "Please omit the usage of nested comments like '/* foo /* bar */'."
        sys.exit(1)

    # print "      * structuring..."

    #for item in all:
    #    if type(item) != types.TupleType:   # item's no longer a tuple!
    #        item = (item,)
    #    fragment = item[0]

    while content:
        mo = R_ALL.search(content)
        if mo:
            fragment = mo.group(0)
        else:
            break

        # print "Found: '%s'" % fragment

        # Handle block comment
        if comment.R_BLOCK_COMMENT.match(fragment):
            source = recoverEscape(fragment)
            format = comment.getFormat(source)
            multiline = comment.isMultiLine(source)

            # print "Type:MultiComment"
            content = parseFragmentLead(content, fragment,
                                        tokens)  # sort of intelligent "pop"

            atBegin = not hasLeadingContent(tokens)
            if re.compile("^\s*\n").search(content):
                atEnd = True
            else:
                atEnd = False

            # print "Begin: %s, End: %s" % (atBegin, atEnd)

            # Fixing source content
            if atBegin:
                source = comment.outdent(source, parseColumn - 1)

            source = comment.correct(source)

            connection = "before"

            if atEnd and not atBegin:
                connection = "after"
            else:
                connection = "before"

            tokens.append({
                "type": "comment",
                "detail": format,
                "multiline": multiline,
                "connection": connection,
                "source": source,
                "id": parseUniqueId,
                "line": parseLine,
                "column": parseColumn,
                "begin": atBegin,
                "end": atEnd
            })
            parseLine += len(fragment.split("\n")) - 1

        # Handle inline comment
        elif comment.R_INLINE_COMMENT.match(fragment):
            # print "Type:SingleComment"
            source = recoverEscape(fragment)
            content = parseFragmentLead(content, fragment, tokens)

            atBegin = hasLeadingContent(tokens)
            atEnd = True

            if atBegin:
                connection = "after"
            else:
                connection = "before"

            source = comment.correct(source)

            tokens.append({
                "type": "comment",
                "detail": "inline",
                "multiline": False,
                "connection": connection,
                "source": source,
                "id": parseUniqueId,
                "line": parseLine,
                "column": parseColumn,
                "begin": atBegin,
                "end": atEnd
            })

        # Handle string
        elif R_STRING_A.match(fragment):
            # print "Type:StringA: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            source = recoverEscape(fragment)[1:-1]
            tokens.append({
                "type": "string",
                "detail": "singlequotes",
                "source": source.replace("\\\n", ""),
                "id": parseUniqueId,
                "line": parseLine,
                "column": parseColumn
            })
            newLines = source.count("\\\n")
            parseLine += newLines
            if newLines:
                parseColumn = len(source) - source.rfind("\\\n") + 2
            else:
                parseColumn += len(source) + 2

        # Handle string
        elif R_STRING_B.match(fragment):
            # print "Type:StringB: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            source = recoverEscape(fragment)[1:-1]
            tokens.append({
                "type": "string",
                "detail": "doublequotes",
                "source": source.replace("\\\n", ""),
                "id": parseUniqueId,
                "line": parseLine,
                "column": parseColumn
            })
            newLines = source.count("\\\n")
            parseLine += newLines
            if newLines:
                parseColumn = len(source) - source.rfind("\\\n") + 2
            else:
                parseColumn += len(source) + 2

        # Handle float num
        elif R_FLOAT.match(fragment):
            # print "Type:Float: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            tokens.append({
                "type": "number",
                "detail": "float",
                "source": fragment,
                "id": parseUniqueId,
                "line": parseLine,
                "column": parseColumn
            })

        # Handle regexps
        #elif R_REGEXP.search(content[:content.index('\n')]):
        #    mo = R_REGEXP.search(content)
        #    regmatch = mo.group(0)
        #    content = parseFragmentLead(content, regmatch, tokens)
        #    tokens.append({ "type" : "regexp", "detail" : "", "source" : recoverEscape(regmatch), "id" : parseUniqueId, "line" : parseLine, "column" : parseColumn })
        #    parseColumn += len(regmatch)

        # Handle operator
        elif R_OPERATORS.match(fragment):
            # print "Type:Operator: %s" % fragment
            content = parseFragmentLead(content, fragment, tokens)
            tokens.append({
                "type": "token",
                "detail": lang.TOKENS[fragment],
                "source": fragment,
                "id": parseUniqueId,
                "line": parseLine,
                "column": parseColumn
            })

        # Handle everything else
        else:
            fragresult = R_REGEXP.search(fragment)

            if fragresult:
                # print "Type:RegExp: %s" % fragresult.group(0)

                if R_REGEXP_A.match(fragment) or R_REGEXP_B.match(
                        fragment) or R_REGEXP_C.match(
                            fragment) or R_REGEXP_D.match(
                                fragment) or R_REGEXP_E.match(fragment):
                    content = parseFragmentLead(content, fragresult.group(0),
                                                tokens)
                    tokens.append({
                        "type": "regexp",
                        "detail": "",
                        "source": recoverEscape(fragresult.group(0)),
                        "id": parseUniqueId,
                        "line": parseLine,
                        "column": parseColumn
                    })

                else:
                    print "Bad regular expression: %s" % fragresult.group(0)

            else:
                print "Type:None!"

    #tokens.extend(parsePart(recoverEscape(content)))
    parsePart(recoverEscape(content), tokens)
    tokens.append({
        "type": "eof",
        "source": "",
        "detail": "",
        "id": parseUniqueId,
        "line": parseLine,
        "column": parseColumn
    })

    return tokens