Example #1
0
def produce_traintest(OrgFP,TestSpec,CheckAgainst=None):
    """Split the sentences of OrgFP into a test file and a training file.

    The outputs are written next to OrgFP as '<stem>_test.mecab' and
    '<stem>_train.mecab', where <stem> is the first element returned by
    myModule.get_stem_ext(OrgFP).  Every sentence is written as its lines
    joined by newlines, followed by an 'EOS' line.

    Args:
        OrgFP: path of the source file; it is read through count_sentences()
            and extract_sentences().
        TestSpec: triple (WhereFrom, TestNum, PercentP).  WhereFrom is the
            1-based sentence position from which test sentences are taken,
            TestNum is the maximum number of test sentences.  When PercentP
            is true both values are percentages of the total sentence count.
        CheckAgainst: optional path of a file with one sentence per line.
            A sentence found there (according to
            already_in_anothersentlist_p) is written to neither output but
            still counts towards the test quota.
    """
    (WhereFrom,TestNum,PercentP)=TestSpec
    SentCnt=count_sentences(OrgFP)
    if PercentP:
        # Integer arithmetic: the former SentCnt//(100/X) went through a
        # float, which can be off by one (e.g. 100 sentences, 7 percent gave
        # 6) and raised ZeroDivisionError for a percentage of 0.
        WhereFrom=int(SentCnt*WhereFrom//100)
        TestNum=int(SentCnt*TestNum//100)
    SentsAlreadyInTest=None
    if CheckAgainst:
        with open(CheckAgainst) as FSrCheck:
            SentsAlreadyInTest=FSrCheck.read().strip().split('\n')
    Stem=myModule.get_stem_ext(OrgFP)[0]
    TestCntr=0
    # 'with' guarantees both outputs are closed even if extraction fails.
    with open(Stem+'_test.mecab','wt') as FSwTest, \
         open(Stem+'_train.mecab','wt') as FSwTrain:
        for Cntr,Sent in enumerate(extract_sentences(OrgFP)):
            if CheckAgainst:
                # The first tab-separated field of every line, concatenated,
                # is the string compared against the existing test sentences.
                SentStr=''.join(Line.split('\t')[0] for Line in Sent)
                if already_in_anothersentlist_p(SentStr,SentsAlreadyInTest):
                    TestCntr+=1
                    continue
            if Cntr+1>=WhereFrom and TestCntr<TestNum:
                TestCntr+=1
                FSwToWrite=FSwTest
            else:
                FSwToWrite=FSwTrain
            FSwToWrite.write('\n'.join(Sent)+'\nEOS\n')
Example #2
0
def generate_ftchunk(MecabDicFP, FtInds, Out=sys.stdout):
    """Yield groups of consecutive sorted-dictionary lines sharing features.

    The dictionary is first sorted by sort_mecabdic_fts() into
    '<stem>.sorted.csv' (with 'rawData' replaced by 'processedData' in the
    path).  The sorted file is then read line by line; consecutive lines
    whose comma-separated fields at the positions in FtInds are equal are
    collected into one list, and each such list is yielded.

    Args:
        MecabDicFP: path of the dictionary file.
        FtInds: indices of the comma-separated fields that define a group.
        Out: unused in this function; kept for interface compatibility.

    Yields:
        Lists of stripped lines, one list per run of equal feature values.
    """
    SortedDicFP = myModule.get_stem_ext(
        MecabDicFP.replace('rawData', 'processedData'))[0] + '.sorted.csv'
    sort_mecabdic_fts(MecabDicFP, FtInds, OutFP=SortedDicFP)
    Lines = []
    PrvRelvFts = None
    with open(SortedDicFP) as FSr:
        for LineNum, LiNe in enumerate(FSr):
            if LineNum == 0:
                # NOTE(review): the first line has always been skipped here;
                # presumably it is a header -- confirm against
                # sort_mecabdic_fts.
                continue
            Line = LiNe.strip()
            LineEls = Line.split(',')
            RelvFts = [LineEls[Ind] for Ind in FtInds]
            # A change of key closes the current group.  Previously the line
            # that triggered the change was dropped and the previous key was
            # never updated, so only empty lists were ever yielded.
            if PrvRelvFts is not None and PrvRelvFts != RelvFts:
                yield Lines
                Lines = []
            Lines.append(Line)
            PrvRelvFts = RelvFts
    # Flush the last group, which has no following key change to trigger it.
    if Lines:
        yield Lines
Example #3
0
def markedsents2outputs(MkdSents,OrgFP,StrictP=True,MoveTo=None):
    """Write error and reduced outputs for marked sentences; report the result.

    Two files are produced while iterating MkdSents: OrgFP+'.errors' and
    '<stem>.reduced.mecab'.  A sentence whose lines all end with the mark
    'original' is written to the reduced output via markedsent2output().
    Any other sentence is recorded as an error when StrictP is true or when
    one of its lines has None as its second-to-last element; otherwise it is
    also written to the reduced output.

    If no error was recorded both output files are deleted again.  Otherwise
    OrgFP and the error file are copied to MoveTo and removed from their
    original location (the reduced output stays where it is).

    Args:
        MkdSents: iterable of marked sentences, each a sequence of indexable
            lines (element 0 is a string, element -1 the mark).
        OrgFP: path of the file the sentences came from.
        StrictP: treat every non-original sentence as an error.
        MoveTo: destination directory for faulty files; defaults to the
            current working directory.

    Returns:
        True if no error was found, False otherwise.
    """
    ErrorOutput=OrgFP+'.errors'
    ReducedOutput=myModule.get_stem_ext(OrgFP)[0]+'.reduced.mecab'
    ErrorCnt=0; LineCntr=0
    with open(ErrorOutput,'wt') as FSwE, open(ReducedOutput,'wt') as FSwR:
        for Cntr,MkdSent in enumerate(MkdSents):
            # +1 accounts for the sentence terminator line.
            LineCntr+=len(MkdSent)+1
            AllOriginalP=all(Line[-1]=='original' for Line in MkdSent)
            if not AllOriginalP and (StrictP or any(Line[-2] is None for Line in MkdSent)):
                ErrorCnt+=1
                FSwE.write(str(Cntr+1)+'; '+str(LineCntr)+'\n'+'\n'.join([ MkdLine[0]+'\t'+MkdLine[-1] for MkdLine in MkdSent])+'\n')
            else:
                FSwR.write(markedsent2output(MkdSent))
    if ErrorCnt==0:
        os.remove(ReducedOutput)
        os.remove(ErrorOutput)
        print('No error found for file '+OrgFP)
        time.sleep(2)
        return True

    print(str(ErrorCnt)+' error(s) found for file '+OrgFP)
    if not MoveTo:
        # os.getcwd must be *called*; the bare function object used to be
        # handed to 'cp' and to string concatenation, raising TypeError.
        MoveTo=os.getcwd()
    # Delete a source file only after its copy succeeded; otherwise a failed
    # 'cp' (e.g. source and destination are the same file) would lose data.
    OrgMovedP=subprocess.call(['cp',OrgFP,MoveTo])==0
    ErrMovedP=subprocess.call(['cp',ErrorOutput,MoveTo])==0
    if OrgMovedP:
        os.remove(OrgFP)
        print('Original file moved to '+MoveTo)
    else:
        print('Could not copy '+OrgFP+' to '+MoveTo+'; original file kept')
    if ErrMovedP:
        os.remove(ErrorOutput)
    else:
        print('Could not copy '+ErrorOutput+' to '+MoveTo+'; error file kept')
    time.sleep(2)
    return False
Example #4
0
def main0(MecabFP,
          CorpusOrDic='dic',
          OutFP=None,
          Debug=0,
          Fts=None,
          UnkAbsFtCnt=2,
          StrictP=False,
          OrgReduced=True):
    """Compress a MeCab file chunk by chunk with lemmatise_mecabchunk().

    Chunks come from generate_chunks(MecabFP, CorpusOrDic).  For every
    successfully processed chunk the new lines followed by 'EOS' are written
    to the output.  Failed chunks are re-run with Debug=2 for diagnosis and,
    unless StrictP is set, an error description is written to stderr and
    collected.

    Args:
        MecabFP: path of the input file.
        CorpusOrDic: passed through to generate_chunks() and
            lemmatise_mecabchunk().
        OutFP: output selector.  True writes to '<stem>.compressed.<ext>'
            derived from MecabFP; None/False writes to stdout; any other
            value is used as the output path.  File output is written to
            '<path>.tmp' first and renamed on completion.
        Debug: non-zero enables progress messages on stderr; also passed to
            lemmatise_mecabchunk().
        Fts: passed through to lemmatise_mecabchunk().
        UnkAbsFtCnt: passed through to lemmatise_mecabchunk().
        StrictP: when true, failures are only re-run with Debug=2 and not
            collected as error strings.
        OrgReduced: also write the original lines of every successful chunk
            to '<output path>.orgreduced'.  Ignored when the output goes to
            stdout, because there is no path to derive the file name from.
    """
    NewWds = set()
    # Resolve the final output path once; OutFP itself may be True/None and
    # must not be used in string concatenation.
    if OutFP is True:
        Stem, Ext = myModule.get_stem_ext(MecabFP)
        FinalFP = Stem + '.compressed.' + Ext
    elif OutFP is None or OutFP is False:
        FinalFP = None
    else:
        FinalFP = OutFP

    TmpFP = FinalFP + '.tmp' if FinalFP else None
    Out = open(TmpFP, 'wt') if FinalFP else sys.stdout
    OrgReducedFSw = None
    ErrorStrs = []
    SentCnt = 0
    try:
        if OrgReduced and FinalFP:
            OrgReducedFSw = open(FinalFP + '.orgreduced', 'wt')

        ChunkGen = generate_chunks(MecabFP, CorpusOrDic)
        print('\nCompressing ' + MecabFP + '\n')
        for Cntr, SentChunk in enumerate(ChunkGen):
            SentCnt = Cntr + 1
            if not SentChunk:
                if Debug:
                    sys.stderr.write('\nsent ' + str(Cntr + 1) +
                                     ' is empty\n')
                continue
            if Debug:
                sys.stderr.write(
                    '\nsent ' + str(Cntr + 1) + ' ' +
                    ''.join([Sent.split('\t')[0] for Sent in SentChunk]) +
                    '\n')
            SuccessP, NewLines = lemmatise_mecabchunk(SentChunk,
                                                      CorpusOrDic,
                                                      NewWds,
                                                      OutFP,
                                                      Debug=Debug,
                                                      Fts=Fts,
                                                      UnkAbsFtCnt=UnkAbsFtCnt)
            if SuccessP:
                Out.write('\n'.join(NewLines + ['EOS']) + '\n')
                if OrgReducedFSw is not None:
                    # Same layout as the main output: the lines, then 'EOS'.
                    # The former 'EOS\n'.join(...) appended EOS to the end of
                    # every line but the last instead of ending the chunk.
                    OrgReducedFSw.write(
                        '\n'.join(list(SentChunk) + ['EOS']) + '\n')
                continue

            if not StrictP:
                FailedNth = len(NewLines)
                if len(NewLines) == 1:
                    MiddlePhr = '(the first word failed)'
                else:
                    MiddlePhr = ('(starting with the word ' +
                                 NewLines[0].split()[0] + ')')
                ErrorStr = ('Sentence ' + str(Cntr + 1) + ' ' + MiddlePhr +
                            ' failed on its ' + str(FailedNth) +
                            'th line:\n' + NewLines[-1].get_mecabline())
                sys.stderr.write('\n' + ErrorStr + '\n')
                ErrorStrs.append(ErrorStr)

            # Replay the failing chunk verbosely for diagnosis, with the same
            # settings as the failed attempt.
            # NOTE(review): in strict mode nothing else happens on failure --
            # presumably the Debug=2 run is expected to stop; confirm.
            lemmatise_mecabchunk(SentChunk,
                                 CorpusOrDic,
                                 NewWds,
                                 OutFP,
                                 Debug=2,
                                 Fts=Fts,
                                 UnkAbsFtCnt=UnkAbsFtCnt)
    finally:
        # Close whatever was opened, even if processing raised.
        if OrgReducedFSw is not None:
            OrgReducedFSw.close()
        if FinalFP:
            Out.close()

    print('\ncompression for ' + MecabFP + ' ended')
    if FinalFP:
        os.rename(TmpFP, FinalFP)
        print('  Output file: ' + FinalFP + '')

        if ErrorStrs:
            ErrorFP = FinalFP + '.errors'
            print('  Error(s) found, error count ' + str(len(ErrorStrs)) +
                  ' out of ' + str(SentCnt) + ' sentences. For details see ' +
                  ErrorFP + '\n')
            time.sleep(2)
            with open(ErrorFP, 'wt') as ErrorOut:
                ErrorOut.write('\n'.join(ErrorStrs))
        else:
            print('  No errors, congrats!\n')