Example #1
import TokenGen

def GetScore(Text, Lexicon, Rules=set(), verbose=False, hyphen=False):

    CapsMatch = 0
    CapsSub = 0
    CapsCount = 0

    LowMatch = 0
    LowSub = 0
    LowCount = 0
    
    if verbose:
        print("Attempting token matching")
        if not Rules:
            print("No substitution rules loaded")

## Unless asked to check for possible fragmented (hyphenated) matches, use the
## basic tokenizer; the function defaults to the basic checker.

    if not hyphen:
        Tokens = TokenGen.Basic(Text, verbose)
    else:
        Tokens = TokenGen.Hyphen(Text, Lexicon, Rules, verbose)

## Maintain separate tallies for capitalized and lower-case tokens, and for
## valid substitutions when a rule set was passed in.

    for word in Tokens:
        if word.islower():
            LowCount += 1
            if word in Lexicon:
                LowMatch += 1
            elif word in Rules:
                LowSub += 1
        else:
            CapsCount += 1
            if word in Lexicon:
                CapsMatch += 1
            elif word in Rules:
                CapsSub += 1

    if verbose:
        print("\t" + str(CapsCount) + " total capitalized tokens")
        print("\t" + str(CapsMatch) + " total capitalized dictionary matches")
        print("\t" + str(CapsSub) + " total capitalized valid substitutions")
        print("\t" + str(LowCount) + " total lower-case tokens")
        print("\t" + str(LowMatch) + " total lower-case dictionary matches")
        print("\t" + str(LowSub) + " total lower-case valid substitutions\n")

## Return the six scores as a tuple.

    return (CapsCount,CapsMatch,CapsSub,LowCount,LowMatch,LowSub)
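
A minimal usage sketch (not from the original source): the lexicon and rule
set below are invented stand-ins, and it assumes the TokenGen module is on the
path with a Basic tokenizer that accepts raw text.

Lexicon = {"the", "quick", "brown", "fox"}     # hypothetical dictionary
Rules = {"teh"}                                # hypothetical substitution rules

scores = GetScore("the quick brown fox teh Fox", Lexicon, Rules, verbose=True)
CapsCount, CapsMatch, CapsSub, LowCount, LowMatch, LowSub = scores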
Example #2
import TokenGen

def GetScore(Text, Lexicon, Rules=set(), verbose=False, hyphen=False):

    CapsMatch = 0
    CapsSub = 0
    CapsCount = 0

    LowMatch = 0
    LowSub = 0
    LowCount = 0
    
    if verbose:
        print("Attempting token matching")
        if not Rules:
            print("No substitution rules loaded")

## This version always uses the hyphen-aware tokenizer, which checks for
## possible fragmented matches; the hyphen argument is accepted but unused.

    Tokens = TokenGen.break_hyphens(Text, Lexicon, Rules, verbose)

## Maintain separate tallies for capitalized and lower-case tokens, and for
## valid substitutions when a rule set was passed in. Only the first character
## decides the case, so an OCR-garbled token like "wiU" (will) still counts as
## lower-case.

    for word in Tokens:
        if not word:
            continue                    # guard against empty tokens
        LowerWord = word.lower()
        if word[0].islower():
            LowCount += 1
            if LowerWord in Lexicon:
                LowMatch += 1
            elif LowerWord in Rules:
                LowSub += 1
        else:
            CapsCount += 1
            if LowerWord in Lexicon:
                CapsMatch += 1
            elif LowerWord in Rules:
                CapsSub += 1

    if verbose:
        print("\t" + str(CapsCount) + " total capitalized tokens")
        print("\t" + str(CapsMatch) + " total capitalized dictionary matches")
        print("\t" + str(CapsSub) + " total capitalized valid substitutions")
        print("\t" + str(LowCount) + " total lower-case tokens")
        print("\t" + str(LowMatch) + " total lower-case dictionary matches")
        print("\t" + str(LowSub) + " total lower-case valid substitutions\n")

## Return the six scores as a tuple.

    return (CapsCount,CapsMatch,CapsSub,LowCount,LowMatch,LowSub)
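
The only classification change from Example #1 is the first-character test; a
quick illustration of why an OCR-garbled token like "wiU" is tallied as
lower-case here but not under the whole-string test:

word = "wiU"                 # OCR error for "will"
print(word.islower())        # False: the stray capital U fails the whole-string test
print(word[0].islower())     # True: the first-character test counts it as lower-case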
Example #3
for index in range(len(HTIDlist)):      # loop header inferred; the snippet begins mid-loop
    IDtoprocess = HTIDlist[index].strip()
    filepath, postfix = FileCabinet.pairtreepath(IDtoprocess, datapath)
    filename = filepath + postfix + '/' + postfix + ".txt"

    try:
        with open(filename, encoding='utf-8') as file:
            lines = file.readlines()
            successflag = True
    except IOError:
        successflag = False

    if not successflag:
        print(IDtoprocess + " is missing.")
        continue

    tokens = TokenGen.keep_hyphens(lines, Lexicon, verbose=debug)    # tokenize, keeping hyphenated forms intact

    volacc = TypeIndex.GetAcc(tokens, Lexicon, debug)                # per-volume accuracy against the lexicon

    types = TypeIndex.GetTypes(tokens, verbose=debug)                # reduce the token stream to distinct types

    TypeIndex.UpdateIndex(BigIndex, types, volacc, debug)            # fold this volume into the running index

### Sort the index into a list, then delete BigIndex to save memory.

SortedIndex = TypeIndex.SortIndex(BigIndex, debug)

del BigIndex

TypeIndex.WriteIndex(SortedIndex, outputpath + writename, delim, debug)
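
Example #3 (like Example #4 below) is a loop body lifted from a longer script,
so names such as HTIDlist, Lexicon, BigIndex, datapath, outputpath, writename,
delim, and debug are defined earlier in that script. A hedged sketch of the
kind of setup they imply; every path and filename here is invented:

import FileCabinet
import TokenGen
import TypeIndex

datapath = "/projects/hathi/volumes/"     # invented path
outputpath = "/projects/hathi/output/"    # invented path
writename = "typeindex.tsv"               # invented filename
delim = "\t"
debug = False

with open("htids.txt", encoding="utf-8") as idfile:
    HTIDlist = idfile.readlines()         # one HathiTrust volume ID per line

Lexicon = set()      # loaded from a dictionary file in the real script
BigIndex = dict()    # running type index, updated volume by volume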
Example #4
for IDtoprocess in HTIDlist:            # loop header inferred (HTIDlist as in Example #3)
    IDtoprocess = IDtoprocess.strip()
    filepath, postfix = FileCabinet.pairtreepath(IDtoprocess, datapath)
    filename = filepath + postfix + '/' + postfix + ".txt"

    try:
        with open(filename, encoding='utf-8') as file:
            lines = file.readlines()
            successflag = True
    except IOError:
        successflag = False

    if not successflag:
        print(IDtoprocess + " is missing.")
        continue
        
    tokens = TokenGen.keep_hyphens(lines, Lexicon, verbose=debug)    # tokenize, keeping hyphenated forms intact

    ## Volumes with very few tokens are probably empty or badly OCR'd; flag
    ## them, but still let them through to indexing.

    if len(tokens) < 10:
        print(IDtoprocess, "has only", len(tokens), "tokens")

    volacc = TypeIndex.GetAcc(tokens, Lexicon, debug)                # per-volume accuracy against the lexicon

    types = TypeIndex.GetTypes(tokens, verbose=debug)                # reduce the token stream to distinct types

    TypeIndex.UpdateIndex(BigIndex, types, volacc, debug)            # fold this volume into the running index

### Sort the index into a list, then delete BigIndex to save memory.

SortedIndex = TypeIndex.SortIndex(BigIndex, debug)

del BigIndex
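
Since the TypeIndex module itself is not shown, here is a minimal sketch of
what an UpdateIndex-style accumulator could look like. This is an assumption
for illustration only, not the real implementation; the (count, summed
accuracy) layout of BigIndex is invented.

## Hypothetical sketch only, not the real TypeIndex.UpdateIndex. Assumes
## BigIndex maps each type to (occurrence count, summed volume accuracy).
def UpdateIndex(BigIndex, types, volacc, debug=False):
    for t in types:
        count, accsum = BigIndex.get(t, (0, 0.0))
        BigIndex[t] = (count + 1, accsum + volacc)
    if debug:
        print(len(BigIndex), "types in index so far")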