Example #1
import TokenGen

def GetScore(Text, Lexicon, Rules=set(), verbose=False, hyphen=False):

    CapsMatch = 0
    CapsSub = 0
    CapsCount = 0

    LowMatch = 0
    LowSub = 0
    LowCount = 0
    
    if verbose:
        print("Attempting token matching")
        if not Rules:
            print("No substitution rules loaded")

## Unless asked to check for possible fragmented (hyphenated) matches, use the
## basic tokenizer; the function defaults to the basic checker.

    if not hyphen:
        Tokens = TokenGen.Basic(Text, verbose)
    else:
        Tokens = TokenGen.Hyphen(Text, Lexicon, Rules, verbose)

## Maintain separate tallies for capitalized and lower-case tokens, and for
## valid substitutions when a rule set was passed in.

    for word in Tokens:
        if word.islower():
            LowCount += 1
            if word in Lexicon:
                LowMatch += 1
            elif word in Rules:
                LowSub += 1
        else:
            CapsCount += 1
            if word in Lexicon:
                CapsMatch += 1
            elif word in Rules:
                CapsSub += 1

    if verbose:
        print("\t" + str(CapsCount) + " total capitalized tokens")
        print("\t" + str(CapsMatch) + " total capitalized dictionary matches")
        print("\t" + str(CapsSub) + " total capitalized valid substitutions")
        print("\t" + str(LowCount) + " total lower-case tokens")
        print("\t" + str(LowMatch) + " total lower-case dictionary matches")
        print("\t" + str(LowSub) + " total lower-case valid substitutions\n")

## Return the six scores as a tuple.

    return (CapsCount,CapsMatch,CapsSub,LowCount,LowMatch,LowSub)
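
A minimal usage sketch (not from the original source): the lexicon and rule
set below are invented stand-ins, and it assumes the TokenGen module is on the
path with a Basic tokenizer that accepts raw text.

Lexicon = {"the", "quick", "brown", "fox"}     # hypothetical dictionary
Rules = {"teh"}                                # hypothetical substitution rules

scores = GetScore("the quick brown fox teh Fox", Lexicon, Rules, verbose=True)
CapsCount, CapsMatch, CapsSub, LowCount, LowMatch, LowSub = scores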
Example #2
import TokenGen

def GetScore(Text, Lexicon, Rules=set(), verbose=False, hyphen=False):

    CapsMatch = 0
    CapsSub = 0
    CapsCount = 0

    LowMatch = 0
    LowSub = 0
    LowCount = 0
    
    if verbose:
        print("Attempting token matching")
        if not Rules:
            print("No substitution rules loaded")

## This version always uses the hyphen-aware tokenizer, which checks for
## possible fragmented matches; the hyphen argument is accepted but unused.

    Tokens = TokenGen.break_hyphens(Text, Lexicon, Rules, verbose)

## Maintain separate tallies for capitalized and lower-case tokens, and for
## valid substitutions when a rule set was passed in. Only the first character
## decides the case, so an OCR-garbled token like "wiU" (will) still counts as
## lower-case.

    for word in Tokens:
        if not word:
            continue                    # guard against empty tokens
        LowerWord = word.lower()
        if word[0].islower():
            LowCount += 1
            if LowerWord in Lexicon:
                LowMatch += 1
            elif LowerWord in Rules:
                LowSub += 1
        else:
            CapsCount += 1
            if LowerWord in Lexicon:
                CapsMatch += 1
            elif LowerWord in Rules:
                CapsSub += 1

    if verbose:
        print("\t" + str(CapsCount) + " total capitalized tokens")
        print("\t" + str(CapsMatch) + " total capitalized dictionary matches")
        print("\t" + str(CapsSub) + " total capitalized valid substitutions")
        print("\t" + str(LowCount) + " total lower-case tokens")
        print("\t" + str(LowMatch) + " total lower-case dictionary matches")
        print("\t" + str(LowSub) + " total lower-case valid substitutions\n")

## Return the six scores as a tuple.

    return (CapsCount,CapsMatch,CapsSub,LowCount,LowMatch,LowSub)
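
The only classification change from Example #1 is the first-character test; a
quick illustration of why an OCR-garbled token like "wiU" is tallied as
lower-case here but not under the whole-string test:

word = "wiU"                 # OCR error for "will"
print(word.islower())        # False: the stray capital U fails the whole-string test
print(word[0].islower())     # True: the first-character test counts it as lower-case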
Example #3
for index in range(len(HTIDlist)):      # loop header inferred; the snippet begins mid-loop
    IDtoprocess = HTIDlist[index].strip()
    filepath, postfix = FileCabinet.pairtreepath(IDtoprocess, datapath)
    filename = filepath + postfix + '/' + postfix + ".txt"

    try:
        with open(filename, encoding='utf-8') as file:
            lines = file.readlines()
            successflag = True
    except IOError:
        successflag = False

    if not successflag:
        print(IDtoprocess + " is missing.")
        continue

    tokens = TokenGen.keep_hyphens(lines, Lexicon, verbose=debug)    # tokenize, keeping hyphenated forms intact

    volacc = TypeIndex.GetAcc(tokens, Lexicon, debug)                # per-volume accuracy against the lexicon

    types = TypeIndex.GetTypes(tokens, verbose=debug)                # reduce the token stream to distinct types

    TypeIndex.UpdateIndex(BigIndex, types, volacc, debug)            # fold this volume into the running index

### Sort the index into a list, then delete BigIndex to save memory.

SortedIndex = TypeIndex.SortIndex(BigIndex, debug)

del BigIndex

TypeIndex.WriteIndex(SortedIndex, outputpath + writename, delim, debug)
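
Example #3 (like Example #4 below) is a loop body lifted from a longer script,
so names such as HTIDlist, Lexicon, BigIndex, datapath, outputpath, writename,
delim, and debug are defined earlier in that script. A hedged sketch of the
kind of setup they imply; every path and filename here is invented:

import FileCabinet
import TokenGen
import TypeIndex

datapath = "/projects/hathi/volumes/"     # invented path
outputpath = "/projects/hathi/output/"    # invented path
writename = "typeindex.tsv"               # invented filename
delim = "\t"
debug = False

with open("htids.txt", encoding="utf-8") as idfile:
    HTIDlist = idfile.readlines()         # one HathiTrust volume ID per line

Lexicon = set()      # loaded from a dictionary file in the real script
BigIndex = dict()    # running type index, updated volume by volume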
Example #4
for IDtoprocess in HTIDlist:            # loop header inferred (HTIDlist as in Example #3)
    IDtoprocess = IDtoprocess.strip()
    filepath, postfix = FileCabinet.pairtreepath(IDtoprocess, datapath)
    filename = filepath + postfix + '/' + postfix + ".txt"

    try:
        with open(filename, encoding='utf-8') as file:
            lines = file.readlines()
            successflag = True
    except IOError:
        successflag = False

    if not successflag:
        print(IDtoprocess + " is missing.")
        continue
        
    tokens = TokenGen.keep_hyphens(lines, Lexicon, verbose=debug)    # tokenize, keeping hyphenated forms intact

    ## Volumes with very few tokens are probably empty or badly OCR'd; flag
    ## them, but still let them through to indexing.

    if len(tokens) < 10:
        print(IDtoprocess, "has only", len(tokens), "tokens")

    volacc = TypeIndex.GetAcc(tokens, Lexicon, debug)                # per-volume accuracy against the lexicon

    types = TypeIndex.GetTypes(tokens, verbose=debug)                # reduce the token stream to distinct types

    TypeIndex.UpdateIndex(BigIndex, types, volacc, debug)            # fold this volume into the running index

### Sort the index into a list, then delete BigIndex to save memory.

SortedIndex = TypeIndex.SortIndex(BigIndex, debug)

del BigIndex
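
Since the TypeIndex module itself is not shown, here is a minimal sketch of
what an UpdateIndex-style accumulator could look like. This is an assumption
for illustration only, not the real implementation; the (count, summed
accuracy) layout of BigIndex is invented.

## Hypothetical sketch only, not the real TypeIndex.UpdateIndex. Assumes
## BigIndex maps each type to (occurrence count, summed volume accuracy).
def UpdateIndex(BigIndex, types, volacc, debug=False):
    for t in types:
        count, accsum = BigIndex.get(t, (0, 0.0))
        BigIndex[t] = (count + 1, accsum + volacc)
    if debug:
        print(len(BigIndex), "types in index so far")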