コード例 #1
0
def segmentizeLine( line, segmentEndPunctuation='.?!;:' ):
    """
    Break the line into segments (like sentences that should match across the translations)
        and then break each segment into words.

    Each character in segmentEndPunctuation ends a segment.
    Em-dashes and en-dashes are treated as word breaks.

    Each whitespace-separated word has every string in
        BibleOrgSysGlobals.internal_SFMs_to_remove deleted from it,
        is passed through BibleOrgSysGlobals.stripWordPunctuation(),
        and then has one leading non-alphanumeric character dropped
        (if the word is longer than one character).
    Words that end up empty, or that consist only of digits and the
        characters ':-,.' (i.e., look like references or numbers), are discarded.

    If you want case folding, convert line to lowerCase before calling.

    Set segmentEndPunctuation to None if you don't want the lines further divided.

    Returns a list of lists of words -- one inner list per segment.
        Note that a segment with no remaining words still contributes an
        empty list (e.g., after trailing segment-end punctuation).
    """
    # Evaluate the debug switches once -- all diagnostic output below depends on both
    debugging = BibleOrgSysGlobals.debugFlag and debuggingThisModule
    if debugging:
        print( exp("segmentizeLine( {!r} )").format( line ) )

    # All segment-end characters are first normalised to one marker string,
    #   so that a single split() can then divide the line
    SEGMENT_MARKER = 'SsSsSsS'
    if segmentEndPunctuation:
        for segmentEndChar in segmentEndPunctuation:
            line = line.replace( segmentEndChar, SEGMENT_MARKER )
    line = line.replace('—',' ').replace('–',' ') # Treat em-dash and en-dash as word break characters

    lineList = []
    for segment in line.split( SEGMENT_MARKER ):
        segmentList = []
        for rawWord in segment.split():
            word = rawWord
            for internalMarker in BibleOrgSysGlobals.internal_SFMs_to_remove:
                word = word.replace( internalMarker, '' )
            word = BibleOrgSysGlobals.stripWordPunctuation( word )

            # Drop one unexpected leading character (but leave single-character words alone)
            if len(word) > 1 and not word[0].isalnum():
                if debugging:
                    # NOTE: This function has no book/chapter/verse context,
                    #   so the message cannot include a reference
                    print( "segmentizeLine: " + _("Have unexpected character starting word {!r}").format( word ) )
                word = word[1:]

            if not word: continue # Nothing left after all that stripping

            if debugging:
                # Report non-alphanumeric characters at either end of the word,
                #   or inside the word if they're not accepted medial punctuation
                lastIndex = len(word) - 1
                for k,char in enumerate( word ):
                    if not char.isalnum() and (k==0 or k==lastIndex or char not in BibleOrgSysGlobals.MEDIAL_WORD_PUNCT_CHARS):
                        print( "segmentizeLine: " + _("Have unexpected {!r} in word {!r}").format( char, word ) )

            # Skip things that look like references or numbers, e.g., '3:16', '1,000', '12-14'
            isAReferenceOrNumber = all( char.isdigit() or char in ':-,.' for char in word )
            if not isAReferenceOrNumber:
                segmentList.append( word )
        lineList.append( segmentList )

    return lineList