Esempio n. 1
0
def get_hom_in_file(FSr,
                    JsonFP,
                    UpTo=100000,
                    FstPosition=0,
                    AssumeSortedP=False):
    """Pop one chunk of records from *FSr* and group their vectors by surface form.

    The chunk comes from ``myModule.pop_chunk_from_stream(FSr, Pattern=',',
    Type='cont')``.  Every line of the chunk is parsed as a JSON array of
    five items ``[PronCat, Ind, Len, Toks, Vec]``; the surface form of a
    record is the concatenation of ``Toks[Ind:Ind + Len]``.

    Args:
        FSr: stream object handed to ``pop_chunk_from_stream``.
        JsonFP, UpTo, FstPosition, AssumeSortedP: not used by this body;
            kept so existing call sites continue to work.

    Returns:
        ``None`` when the helper hands back a falsy chunk or a falsy stream.
        Otherwise the tuple ``(FSr, OrthsVecs, PronCat, LineCnt, MultiToks)``:
        the stream returned by the helper, a ``defaultdict(list)`` mapping
        each surface form to its vectors, the first field of the *last*
        parsed line, the helper's line count, and the surface forms whose
        ``Len`` is 2 or more (one entry per such record, duplicates kept).
    """
    FSr, Chunk, LineCnt, _ = myModule.pop_chunk_from_stream(FSr,
                                                            Pattern=',',
                                                            Type='cont')
    # Nothing to parse (presumably end of stream -- confirm against helper).
    if not (Chunk and FSr):
        return None

    OrthsVecs = defaultdict(list)
    MultiToks = []
    for Record in map(json.loads, Chunk.strip().split('\n')):
        PronCat, Ind, Len, Toks, Vec = Record
        Orth = ''.join(Toks[Ind:Ind + Len])
        OrthsVecs[Orth].append(Vec)
        if Len >= 2:
            MultiToks.append(Orth)

    # PronCat is whatever the last line carried.
    return FSr, OrthsVecs, PronCat, LineCnt, MultiToks
Esempio n. 2
0
def mark_sents(FP,FtCnts,Recover=True,Output=None):
    '''
    Lazily walk the file at FP sentence by sentence and yield the marked
    lines of each sentence.

    Sentences are the chunks returned by
    myModule.pop_chunk_from_stream(FSr,Pattern='EOS').  Reading stops at the
    first chunk for which the helper reports a falsy "next line" value;
    that chunk itself is not processed.

    For a chunk that is empty after stripping, [(Sent,None,'empty sent')]
    is yielded when Recover is true and nothing is yielded otherwise.
    For any other chunk the result of
    mark_sentlines(<stripped lines>,FtCnts,Recover=Recover) is yielded.

    Output is not used by this body.  Debug and mark_sentlines are
    expected to exist at module level.
    (An end-of-file sanity check was once sketched here but is disabled.)
    '''
    with open(FP,'rt',encoding='utf-8') as FSr:
        LineCnt=0
        while True:
            FSr,Sent,LineCntPerSent,NextLine=myModule.pop_chunk_from_stream(FSr,Pattern='EOS')
            if not NextLine:
                break
            # running total of lines consumed, only used for the debug trace
            LineCnt+=LineCntPerSent
            Stripped=Sent.strip()
            if Stripped=='':
                if Recover:
                    yield [(Sent,None,'empty sent')]
                continue
            if Debug:
                print('Now at '+str(LineCnt))
            yield mark_sentlines(Stripped.split('\n'),FtCnts,Recover=Recover)
Esempio n. 3
0
def get_hom_in_file(FSr,
                    JsonFP,
                    UpTo=100000,
                    FstPosition=0,
                    AssumeSortedP=False):
    """Pop one chunk of records from *FSr* and group their vectors by token.

    The chunk comes from ``myModule.pop_chunk_from_stream(FSr, Pattern=',',
    Type='cont')``.  Every line of the chunk is parsed as a JSON array whose
    first four items are used as ``[Key, Ind, Toks, Vec]``; the grouping key
    is the single token ``Toks[Ind]``.

    Args:
        FSr: stream object handed to ``pop_chunk_from_stream``.
        JsonFP, UpTo, FstPosition, AssumeSortedP: not used by this body;
            kept so existing call sites continue to work.

    Returns:
        ``None`` when the helper hands back a falsy chunk or a falsy stream
        (previously this case crashed inside ``json.loads``/``strip``).
        Otherwise the tuple ``(FSr, OrthsVecs, Key, LineCnt)``: the stream
        returned by the helper, a ``defaultdict(list)`` mapping each token
        to its vectors, the first field of the *last* parsed line, and the
        helper's line count.
    """
    FSr, Chunk, LineCnt, _ = myModule.pop_chunk_from_stream(FSr,
                                                            Pattern=',',
                                                            Type='cont')
    # Without this guard an empty chunk reaches json.loads('') and raises;
    # the five-field variant of this function returns None here as well.
    if not Chunk or not FSr:
        return None

    OrthsVecs = defaultdict(list)
    for Line in Chunk.strip().split('\n'):
        HomVecs = json.loads(Line)
        Orth = HomVecs[2][HomVecs[1]]
        OrthsVecs[Orth].append(HomVecs[3])

    # HomVecs still refers to the last parsed line at this point.
    return FSr, OrthsVecs, HomVecs[0], LineCnt
Esempio n. 4
0
def extract_sentences(FileP,LineNums='all',ReturnRaw=False,Print=False):
    '''
    Generate sentences from the file at FileP, one per 'EOS'-delimited chunk
    as returned by myModule.pop_chunk_from_stream(FSr,Pattern='EOS').

    FileP     path of the file, opened as UTF-8 text
    LineNums  'all' to yield every chunk, otherwise an iterable of 1-based
              chunk ordinals to yield; the caller's object is not modified
    ReturnRaw when true the chunk is yielded exactly as the helper returned
              it, otherwise it is stripped and split into a list of lines
    Print     not used by this generator

    The first chunk is always considered.  Iteration ends as soon as the
    helper reports a falsy "next line" value (the chunk read at that point
    is not yielded) or when every requested ordinal has been yielded.
    The file is closed when the generator finishes or is closed.
    '''
    def chunkprocess(Chunk,ReturnRaw):
        # the raw branch used to fall through and hand back None
        if ReturnRaw:
            return Chunk
        return Chunk.strip().split('\n')

    extract_chunk=lambda FSr: myModule.pop_chunk_from_stream(FSr,Pattern='EOS')

    # Work on a private copy so the caller's LineNums is left untouched.
    Wanted=None if LineNums=='all' else set(LineNums)

    # The helper returns a stream that rebinds FSr; the with-block keeps
    # hold of the originally opened handle so it is always closed.
    with open(FileP,'rt',encoding='utf-8') as Stream:
        FSr,Chunk,_,NxtLine=extract_chunk(Stream)
        Cntr=0
        while True:
            Cntr+=1
            if Wanted is None:
                yield chunkprocess(Chunk,ReturnRaw)
            elif Cntr in Wanted:
                Wanted.discard(Cntr)
                yield chunkprocess(Chunk,ReturnRaw)

            FSr,Chunk,_,NxtLine=extract_chunk(FSr)
            if not NxtLine:
                break
            if Wanted is not None and not Wanted:
                break
Esempio n. 5
0
def extract_sentences(FileP,LineNums='all',ReturnRaw=False,Print=False):
    '''
    Collect sentences from the file at FileP, one per 'EOS'-delimited chunk
    as returned by myModule.pop_chunk_from_stream(FSr,Pattern='EOS').

    FileP     path of the file, opened in text mode
    LineNums  'all' to keep every chunk, otherwise an iterable of 1-based
              chunk ordinals to keep; the caller's object is not modified
    ReturnRaw when true each chunk is kept exactly as the helper returned
              it, otherwise it is stripped and split into a list of lines
    Print     when true the collected list is printed before returning

    The first chunk is always considered.  Reading ends as soon as the
    helper reports a falsy "next line" value (the chunk read at that point
    is not kept) or when every requested ordinal has been collected.

    Returns the list of collected sentences.
    '''
    def chunkprocess(Chunk,ReturnRaw):
        # the raw branch used to fall through and hand back None
        if ReturnRaw:
            return Chunk
        return Chunk.strip().split('\n')

    extract_chunk=lambda FSr: myModule.pop_chunk_from_stream(FSr,Pattern='EOS')

    # Work on a private copy so the caller's LineNums is left untouched.
    Wanted=None if LineNums=='all' else set(LineNums)

    Sents2Ext=[]
    # NOTE(review): no explicit encoding here, so the platform default is
    # used; confirm whether utf-8 should be forced.
    # The helper returns a stream that rebinds FSr; the with-block keeps
    # hold of the originally opened handle so it is always closed.
    with open(FileP,'rt') as Stream:
        FSr,Chunk,_,NxtLine=extract_chunk(Stream)
        Cntr=0
        while True:
            Cntr+=1
            if Wanted is None:
                Sents2Ext.append(chunkprocess(Chunk,ReturnRaw))
            elif Cntr in Wanted:
                Wanted.discard(Cntr)
                Sents2Ext.append(chunkprocess(Chunk,ReturnRaw))

            FSr,Chunk,_,NxtLine=extract_chunk(FSr)
            if not NxtLine:
                break
            if Wanted is not None and not Wanted:
                break

    if Print: print(Sents2Ext)
    return Sents2Ext
Esempio n. 6
0
def mark_sents(FP,FtCnts,Recover=True,Output=None):
    '''
    Read the file at FP sentence by sentence and return, for every sentence,
    a list of (original line, usable line or None, status) triples.

    Sentences are the chunks returned by
    myModule.pop_chunk_from_stream(FSr,Pattern='EOS').  Reading stops at the
    first chunk for which the helper reports a falsy "next line" value;
    that chunk itself is not processed.

    FP       path of the file, opened in text mode
    FtCnts   passed through unchanged to something_wrong_insideline
    Recover  when true, faulty lines go through try_and_recover and empty
             sentences are recorded as [(Sent,None,'empty sent')]; when
             false, faulty lines are recorded as failures and empty
             sentences are skipped
    Output   not used by this body

    Status is 'original' for a line with nothing wrong, 'recovered' for a
    successfully repaired line, and otherwise the value returned by
    something_wrong_insideline for the original line.
    (An end-of-file sanity check was once sketched here but is disabled.)
    '''

    def mark_errors_sentlines(SentLines,FtCnts,SentCnt,FstLineNum,Recover=True):
        # FstLineNum is the number of lines that precede this sentence, so
        # FstLineNum+Cntr+1 is the 1-based line number inside the file.
        MkdLines=[]
        for (Cntr,Line) in enumerate(SentLines):
            Wrong=something_wrong_insideline(Line,FtCnts)
            if not Wrong:
                MkdLines.append((Line,Line,'original'))
                continue
            # from here on there is something wrong with the line
            if not Recover:
                MkdLines.append((Line,None,Wrong))
                continue
            print('error found ('+Wrong+' at '+str(SentCnt)+'/'+str(FstLineNum+Cntr+1)+'), attempting to recover')
            Attempted=try_and_recover(Line,Wrong)
            # failure is either no result at all or a result that is still faulty
            if Attempted is None or something_wrong_insideline(Attempted,FtCnts):
                print('recovery failed')
                MkdLines.append((Line,None,Wrong))
            else:
                print('recovery successful')
                MkdLines.append((Line,Attempted,'recovered'))

        return MkdLines


    with open(FP,'rt') as FSr:
        extract_chunk=lambda FSr: myModule.pop_chunk_from_stream(FSr,Pattern='EOS')

        MkdSents=[]; SentCnt=LineCnt=0; NextLine=True
        while NextLine:
            FSr,Sent,LineCntPerSent,NextLine=extract_chunk(FSr)
            if NextLine:
                # Remember the offset *before* adding this sentence's lines;
                # passing the already-incremented total made every reported
                # line number too high by the length of the sentence.
                FstLineNum=LineCnt
                LineCnt+=LineCntPerSent;SentCnt+=1
                if Sent.strip()=='':
                    if Recover:
                        MkdSents.append([(Sent,None,'empty sent')])
                else:
                    MkdLines=mark_errors_sentlines(Sent.strip().split('\n'),FtCnts,SentCnt,FstLineNum,Recover=Recover)
                    MkdSents.append(MkdLines)


    return MkdSents