def get_hom_in_file(FSr, JsonFP, UpTo=100000, FstPosition=0, AssumeSortedP=False):
    """Pop the next chunk of JSON records from FSr and group vectors by orthography.

    Each line of the chunk is parsed as a JSON array of five items, unpacked
    as (PronCat, Ind, Len, Toks, Vec). The orthography is the concatenation
    of ``Toks[Ind:Ind + Len]``; ``Vec`` is appended to that orthography's list.

    Args:
        FSr: stream handed to ``myModule.pop_chunk_from_stream``.
        JsonFP, UpTo, FstPosition, AssumeSortedP: not used in this body;
            kept so existing callers keep working.

    Returns:
        ``None`` when the popped chunk or the returned stream is falsy.
        Otherwise a 5-tuple ``(FSr, OrthsVecs, PronCat, LineCnt, MultiToks)``
        where ``OrthsVecs`` is a ``defaultdict(list)`` mapping orthography to
        its vectors, ``PronCat`` is the first field of the LAST record in the
        chunk, ``LineCnt`` is the count reported by the chunk popper, and
        ``MultiToks`` lists the orthographies whose ``Len`` is 2 or more
        (one entry per record, duplicates included).
    """
    FSr, Chunk, LineCnt, _ = myModule.pop_chunk_from_stream(FSr, Pattern=',', Type='cont')
    if not (Chunk and FSr):
        return None

    VecsByOrth = defaultdict(list)
    MultiTokOrths = []
    for RawRecord in Chunk.strip().split('\n'):
        PronCat, Start, Span, Tokens, Vec = json.loads(RawRecord)
        Surface = ''.join(Tokens[Start:Start + Span])
        if Span >= 2:
            MultiTokOrths.append(Surface)
        VecsByOrth[Surface].append(Vec)

    # PronCat deliberately carries the value from the final record of the chunk.
    return FSr, VecsByOrth, PronCat, LineCnt, MultiTokOrths
def mark_sents(FP, FtCnts, Recover=True, Output=None):
    """Lazily yield the marked lines of each 'EOS'-delimited sentence in FP.

    The file is read chunk by chunk through
    ``myModule.pop_chunk_from_stream(..., Pattern='EOS')``. A chunk is only
    processed when the popper's fourth return value is truthy; the first
    falsy value ends the iteration.

    Args:
        FP: path of the UTF-8 text file to read.
        FtCnts: passed through unchanged to ``mark_sentlines``.
        Recover: passed through to ``mark_sentlines``; it also decides whether
            blank sentences are reported (``True``) or silently skipped.
        Output: not used in this body.

    Yields:
        For a blank sentence (only when ``Recover`` is true) the one-element
        list ``[(Sent, None, 'empty sent')]``; otherwise whatever
        ``mark_sentlines`` returns for the sentence's lines.

    NOTE: an end-of-file sanity check (trailing blank lines / missing final
    EOS) was sketched here in an earlier revision but was never enabled.
    """
    with open(FP, 'rt', encoding='utf-8') as FSr:
        LineCnt = 0
        while True:
            FSr, Sent, LineCntPerSent, NextLine = myModule.pop_chunk_from_stream(FSr, Pattern='EOS')
            if not NextLine:
                break
            LineCnt += LineCntPerSent

            Body = Sent.strip()
            if Body == '':
                if Recover:
                    yield [(Sent, None, 'empty sent')]
                continue

            if Debug:
                print('Now at '+str(LineCnt))
            yield mark_sentlines(Body.split('\n'), FtCnts, Recover=Recover)
def get_hom_in_file(FSr, JsonFP, UpTo=100000, FstPosition=0, AssumeSortedP=False):
    """Pop the next chunk of JSON records from FSr and group vectors by orthography.

    Each line of the chunk is parsed as a JSON array. Field 0 is returned as
    the pronunciation category, field 1 is an index into field 2 (the token
    list) selecting the orthography, and field 3 is the vector collected
    under that orthography.

    Args:
        FSr: stream handed to ``myModule.pop_chunk_from_stream``.
        JsonFP, UpTo, FstPosition, AssumeSortedP: not used in this body;
            kept so existing callers keep working.

    Returns:
        ``None`` when the popped chunk or the returned stream is falsy
        (nothing left to read). Otherwise a 4-tuple
        ``(FSr, OrthsVecs, PronCat, LineCnt)`` where ``OrthsVecs`` is a
        ``defaultdict(list)`` mapping orthography to its vectors and
        ``PronCat`` is field 0 of the LAST record in the chunk.

    Raises:
        json.JSONDecodeError: if a line of the chunk is not valid JSON.
    """
    FSr, Chunk, LineCnt, _ = myModule.pop_chunk_from_stream(FSr, Pattern=',', Type='cont')

    # Without this guard an exhausted stream reaches json.loads('') below and
    # dies with a JSONDecodeError instead of signalling "no more data".
    if not Chunk or not FSr:
        return None

    OrthsVecs = defaultdict(list)
    PronCat = None
    for Line in Chunk.strip().split('\n'):
        HomVecs = json.loads(Line)
        PronCat = HomVecs[0]
        Orth = HomVecs[2][HomVecs[1]]
        OrthsVecs[Orth].append(HomVecs[3])

    return FSr, OrthsVecs, PronCat, LineCnt
def extract_sentences(FileP, LineNums='all', ReturnRaw=False, Print=False):
    """Lazily yield 'EOS'-delimited chunks (sentences) from a UTF-8 text file.

    Args:
        FileP: path of the file to read.
        LineNums: ``'all'`` to yield every chunk, or a collection of 1-based
            chunk ordinals to yield. The caller's collection is NOT modified.
        ReturnRaw: when true, yield each chunk exactly as returned by
            ``myModule.pop_chunk_from_stream``; otherwise yield the chunk
            stripped and split into a list of lines.
        Print: not used in this generator variant; kept for signature
            compatibility.

    Yields:
        One item per selected chunk (raw string or list of lines, see
        ``ReturnRaw``). Iteration stops once every requested ordinal has been
        produced or the chunk popper reports no further line.
    """
    def chunkprocess(Chunk, ReturnRaw):
        # Previously the raw case fell through and produced None.
        if ReturnRaw:
            return Chunk
        return Chunk.strip().split('\n')

    TakeAll = LineNums == 'all'
    # Work on a private copy so the caller's list is left untouched.
    Remaining = None if TakeAll else set(LineNums)

    # The context manager closes the file when the generator finishes or is
    # closed/garbage-collected; the handle used to be leaked.
    with open(FileP, 'rt', encoding='utf-8') as FSr:
        # NOTE(review): the first chunk is emitted without looking at NxtLine,
        # so an empty file presumably still produces one (empty) chunk --
        # confirm against pop_chunk_from_stream before changing this.
        FSr, Chunk, _, NxtLine = myModule.pop_chunk_from_stream(FSr, Pattern='EOS')
        Cntr = 0
        while True:
            Cntr += 1
            if TakeAll:
                yield chunkprocess(Chunk, ReturnRaw)
            elif Cntr in Remaining:
                Remaining.discard(Cntr)
                yield chunkprocess(Chunk, ReturnRaw)

            FSr, Chunk, _, NxtLine = myModule.pop_chunk_from_stream(FSr, Pattern='EOS')
            if not NxtLine or (not TakeAll and not Remaining):
                break
def extract_sentences(FileP, LineNums='all', ReturnRaw=False, Print=False):
    """Collect 'EOS'-delimited chunks (sentences) from a text file into a list.

    Args:
        FileP: path of the file to read (decoded as UTF-8, matching the other
            readers in this file).
        LineNums: ``'all'`` to collect every chunk, or a collection of 1-based
            chunk ordinals to collect. The caller's collection is NOT modified.
        ReturnRaw: when true, store each chunk exactly as returned by
            ``myModule.pop_chunk_from_stream``; otherwise store the chunk
            stripped and split into a list of lines.
        Print: when true, print the collected list before returning it.

    Returns:
        List with one entry per selected chunk, in file order. Reading stops
        once every requested ordinal has been collected or the chunk popper
        reports no further line.
    """
    def chunkprocess(Chunk, ReturnRaw):
        # Previously the raw case fell through and produced None.
        if ReturnRaw:
            return Chunk
        return Chunk.strip().split('\n')

    TakeAll = LineNums == 'all'
    # Work on a private copy so the caller's list is left untouched.
    Remaining = None if TakeAll else set(LineNums)

    Sents2Ext = []
    # 'with' guarantees the handle is closed; it used to be leaked.
    with open(FileP, 'rt', encoding='utf-8') as FSr:
        # NOTE(review): the first chunk is taken without looking at NxtLine,
        # so an empty file presumably still contributes one (empty) chunk --
        # confirm against pop_chunk_from_stream before changing this.
        FSr, Chunk, _, NxtLine = myModule.pop_chunk_from_stream(FSr, Pattern='EOS')
        Cntr = 0
        while True:
            Cntr += 1
            if TakeAll:
                Sents2Ext.append(chunkprocess(Chunk, ReturnRaw))
            elif Cntr in Remaining:
                Remaining.discard(Cntr)
                Sents2Ext.append(chunkprocess(Chunk, ReturnRaw))

            FSr, Chunk, _, NxtLine = myModule.pop_chunk_from_stream(FSr, Pattern='EOS')
            if not NxtLine or (not TakeAll and not Remaining):
                break

    if Print:
        print(Sents2Ext)
    return Sents2Ext
def mark_sents(FP, FtCnts, Recover=True, Output=None):
    """Mark every line of every 'EOS'-delimited sentence in FP and return the result.

    The file is read chunk by chunk through
    ``myModule.pop_chunk_from_stream(..., Pattern='EOS')``. A chunk is only
    processed when the popper's fourth return value is truthy; the first
    falsy value ends the loop.

    Args:
        FP: path of the text file to read (decoded as UTF-8, matching the
            generator variant of this function).
        FtCnts: passed through unchanged to ``something_wrong_insideline``.
        Recover: when true, faulty lines go through ``try_and_recover`` and
            blank sentences are recorded; when false, faulty lines are only
            flagged and blank sentences are skipped.
        Output: not used in this body.

    Returns:
        List with one entry per recorded sentence. Each entry is a list of
        3-tuples ``(original, usable, status)``:
          * ``(Line, Line, 'original')``       -- nothing wrong with the line
          * ``(Line, Attempted, 'recovered')`` -- recovery produced a clean line
          * ``(Line, None, Wrong)``            -- faulty; ``Wrong`` is the
            problem reported by ``something_wrong_insideline``
          * ``(Sent, None, 'empty sent')``     -- blank sentence (single tuple)

    NOTE: an end-of-file sanity check (trailing blank lines / missing final
    EOS) was sketched here in an earlier revision but was never enabled.
    """
    def mark_errors_sentlines(SentLines, FtCnts, SentCnt, FstLineNum, Recover=True):
        """Mark the lines of one sentence.

        ``FstLineNum`` is the number of lines that precede the sentence, so
        ``FstLineNum + index + 1`` is the 1-based position used in messages.
        """
        MkdLines = []
        for (Cntr, Line) in enumerate(SentLines):
            Wrong = something_wrong_insideline(Line, FtCnts)
            if not Wrong:
                MkdLines.append((Line, Line, 'original'))
                continue
            if not Recover:
                MkdLines.append((Line, None, Wrong))
                continue

            print('error found ('+Wrong+' at '+str(SentCnt)+'/'+str(FstLineNum+Cntr+1)+'), attempting to recover')
            Attempted = try_and_recover(Line, Wrong)
            # Failure is either "no candidate at all" or "candidate still faulty".
            if Attempted is None or something_wrong_insideline(Attempted, FtCnts):
                print('recovery failed')
                MkdLines.append((Line, None, Wrong))
            else:
                print('recovery successful')
                MkdLines.append((Line, Attempted, 'recovered'))
        return MkdLines

    MkdSents = []
    SentCnt = LineCnt = 0
    with open(FP, 'rt', encoding='utf-8') as FSr:
        while True:
            FSr, Sent, LineCntPerSent, NextLine = myModule.pop_chunk_from_stream(FSr, Pattern='EOS')
            if not NextLine:
                break

            # Remember how many lines came BEFORE this sentence. The running
            # total used to be advanced first, which pushed every reported
            # line number past the end of the sentence it belonged to.
            # Assumes LineCntPerSent counts every line consumed for the
            # chunk -- TODO confirm against pop_chunk_from_stream.
            LinesBefore = LineCnt
            LineCnt += LineCntPerSent
            SentCnt += 1

            Body = Sent.strip()
            if Body == '':
                if Recover:
                    MkdSents.append([(Sent, None, 'empty sent')])
                continue

            MkdSents.append(
                mark_errors_sentlines(Body.split('\n'), FtCnts, SentCnt, LinesBefore, Recover=Recover)
            )
    return MkdSents