Example #1
def findRelatedGrams():

    klog.msg('find co-occurring terms')
    Hist = {}

    for idx_raw, text in enumerate(ieeePapers()):

        sentences = lang.getSentenceList(text)

        for idofs, s in enumerate(sentences):

            grams = ngramsOfSentence(s)
            kbay.countCooccurConsideringSubgram(grams,
                                                Hist,
                                                gramDF=GRAM_DF,
                                                debug=0)

        if idx_raw % 1000 == 0:
            mem.refreshMemory2D(Hist, steps=idx_raw, prefilter=True)
            h, hh = len(Hist), kbay.sizeOf2DHist(Hist)
            # rows, total entries, and average entries per row
            print "hist size", h, hh, float(hh) / h

        peek(idx_raw + 1, 1000)
        if idx_raw > 200000: break

    kbay.filter2DHistByCount(Hist, 2, verbose=True)
    kbay.saveDF2D(Hist, 'tmp_ieee_occur_all.txt')
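
The counting above delegates to kbay.countCooccurConsideringSubgram, whose subgram handling is project-specific and not shown. As a rough, self-contained sketch of the underlying idea, plain 2-D co-occurrence counting into a nested dict (count_cooccur is a hypothetical stand-in, not the library function):

def count_cooccur(grams, hist):
    # count every ordered pair of distinct grams from one sentence
    for g1 in grams:
        for g2 in grams:
            if g1 == g2:
                continue
            row = hist.setdefault(g1, {})
            row[g2] = row.get(g2, 0) + 1

hist = {}
count_cooccur(["neural network", "training"], hist)
print(hist["neural network"])   # {'training': 1}

The nested-dict layout is what len(Hist) versus kbay.sizeOf2DHist(Hist) measures: outer keys (rows) versus total inner entries.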
Example #2
def coocurWordFirstOrder():

    klog.msg('find word coocur')
    Hist = {}
    nNumSentences = 0
    for nFiles, text in enumerate(textFiles()):
        sentences = lang.getSentenceList(text)
        nNumSentences += len(sentences)
        for ids, s in enumerate(sentences):
            tokenlist = lang.tokenize(s)
            tokenlist = lang.regularwords(tokenlist)
            selected = [t for t in tokenlist if not lang.isNoise(t)]
            #kbay.countCooccur(selected, Hist)
            kbay.countCooccurNearest(selected, Hist, nearest=10)

        if nFiles % 1000 == 1:
            print nFiles, " files: #.sentences", nNumSentences
            refreshMemory2D(Hist, steps=nFiles, prefilter=True)
            h, hh = len(Hist), kbay.sizeOf2DHist(Hist)
            # rows, total entries, and average entries per row
            print "hist size", h, hh, float(hh) / h
        peek(nFiles + 1, 1000)
        if nFiles > 100000: break

    print "number of sentences:", nNumSentences
    klog.msg("finished total %s files" % nFiles)
    refreshMemory2D(Hist, steps=nFiles, prefilter=True)
    kbay.filter2DHistByCount(Hist, 3, verbose=True)
    kbay.saveDF2D(Hist, SAVEDIR + SrcFileName + '_word_coocur.txt')
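
kbay.countCooccurNearest is not shown in these examples. A plausible reading, an assumption rather than the library source, is that a pair is counted only when the two tokens sit within nearest positions of each other, which keeps Hist far smaller than all-pairs counting:

def count_cooccur_nearest(tokens, hist, nearest=10):
    # count (earlier, later) pairs within a window of `nearest` tokens
    for i, t1 in enumerate(tokens):
        for t2 in tokens[i + 1:i + 1 + nearest]:
            if t2 == t1:
                continue
            row = hist.setdefault(t1, {})
            row[t2] = row.get(t2, 0) + 1

hist = {}
count_cooccur_nearest(["deep", "learning", "model"], hist, nearest=1)
print(hist)   # {'deep': {'learning': 1}, 'learning': {'model': 1}}

This sketch counts each ordered pair once, left to right; the real helper may count symmetrically.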
Example #3
def coocurWord(save2file, verbose=0):
    klog.msg('find word coocur')
    Hist = {}
    for nFiles, (datafile, text) in enumerate(loopWikiData()):
        sentences = cleaner.getSentences_regex(text)
        for s in sentences:
            #selected = set(cleaner.capitalizedWordsOf(s))
            selected = cleaner.tokenize_simple(s)
            if verbose:
                print "sentence:", s
                print "selected:", selected
                pause()
            #kbay.countCooccurNearest(selected, Hist, nearest=10)
            kbay.countCooccurNearest(selected, Hist, nearest=2)
            #print "learn", selected
            #print Hist, len(Hist)
            #pause()
        if nFiles % 1000 == 0: print nFiles, " files done"
        if nFiles % 4000 == 0:
            print "before mem refresh:", len(Hist)
            memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True)
            print "after mem refresh:", len(Hist), '\n'
        if nFiles > 40000: break

    klog.msg("finished total %s files" % nFiles)
    memoryRefresh.refreshMemory2D(Hist, steps=nFiles, prefilter=True)
    kbay.filter2DHistByCount(Hist, 3, verbose=True)

    fif.saveHist2D(Hist, save2file)

    return Hist
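
All three examples call a refreshMemory2D helper between batches of files. Its exact pruning policy, and what steps and prefilter control, is not shown anywhere here; a minimal stand-in that captures the apparent intent, dropping rare pairs so Hist stays bounded during a long crawl, might look like this (refresh_memory_2d and min_count are assumptions):

def refresh_memory_2d(hist, min_count=2):
    # prune inner entries below min_count; drop rows that become empty
    for key in list(hist):
        row = {k: v for k, v in hist[key].items() if v >= min_count}
        if row:
            hist[key] = row
        else:
            del hist[key]

This mirrors the before/after size prints in coocurWord: the histogram shrinks at each refresh instead of growing monotonically.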
Example #4
def findNgrams(DO_COOCUR=False, ROUND=1):

    NGRAMS_DOC = {}
    COOCUR_S = {}  # co-occurrence in a sentence
    NGRAM_LR = {}

    for idx, text in enumerate(textFiles()):
        peek(idx + 1, 1000)

        #if idx>1000: break

        ngram_local = {}
        sentences = lang.getSentenceList(text)

        for idofs, s in enumerate(sentences):

            tokenlist = lang.tokenize(s)
            poslist = lang.posnize(tokenlist)

            #print "\n-----tokens and poslist----"
            #print ["(%s, %s)"%(t, poslist[i]) for i, t in enumerate(tokenlist)]

            if len(tokenlist) <= 5:
                # too short to yield useful n-grams
                continue

            tokenstoplist = lang.markStops(tokenlist)

            if 0:
                print "stops:"
                print tokenstoplist
                #pause()

            if len(tokenlist) > 80:
                # skip abnormally long sentences (likely bad segmentation)
                continue

            ngb = lang.ngrambounds(tokenstoplist)
            #print "gram with bounds:", ngb

            selecedngb = lang.filterSRS(ngb, tokenstoplist)
            #print "\nSRS-FIL gram with bounds:", selecedngb
            selecedngb = lang.filterAdj(selecedngb, s)
            #print "\nADJ-FIL gram with bounds:", selecedngb
            selecedngb = lang.filterAdv(selecedngb, s)
            #print "\nADV-FIL gram with bounds:", selecedngb
            #selecedngb = lang.filterVerb(selecedngb, s, verbose=0) #<--- "contrast", "field" incorrectly ignored
            #print "\nVERB-FIL gram with bounds:", selecedngb

            # do it again after pure pos-based filtering
            selecedngb = lang.filterSRS(selecedngb, tokenstoplist)
            #print "\nFINAL selected gram with bounds:", selecedngb

            if ROUND == 1:
                # in the 1st round, profile the next word after a gram
                for (gram, leftidx, rightidx) in selecedngb:
                    nextword = lang.nextword(rightidx, tokenlist)
                    prevword = lang.prevword(leftidx, tokenlist)
                    nextwordcode = lang.profilingCode(nextword)
                    prevwordcode = lang.profilingCode(prevword)

                    kbay.inc3d(gram, '_', '_', NGRAM_LR)  # '_' as itself
                    kbay.inc3d(gram, 'l', prevwordcode, NGRAM_LR)
                    kbay.inc3d(gram, 'r', nextwordcode, NGRAM_LR)

                    if lang.isSubject(leftidx, rightidx, tokenlist):
                        kbay.inc3d(gram, '_', 's', NGRAM_LR)
                        #print "subject:", gram
                        #pause()

            if ROUND == 2:
                # in the 2nd round, justify the gram
                for ngb in selecedngb:
                    print "check this:", ngb
                    sg = grammer.subgram(ngb[0], ngb[1], ngb[2],
                                         READIN_GRAM_LR, tokenlist, poslist)
                    if sg:
                        print "gram", ngb, "subgram", sg
                        raw_input()

            if 0:
                print "\n\n", s
                print "raw   ngb >", ngb
                print "final ngb >", selecedngb
                pause()

            ngrams = [t[0] for t in selecedngb]
            ngrams = [g for g in ngrams if len(g.split()) > 1]

            kbay.count(ngrams, ngram_local)

            if DO_COOCUR:
                for n1 in ngrams:
                    for n2 in ngrams:
                        kbay.inc2d(n1, n2, COOCUR_S)

        # doc.freq. - each gram counted only once
        kbay.count(ngram_local, NGRAMS_DOC)

    kbay.saveHist3D(NGRAM_LR, SAVEDIR + 'hist.txt')

    #print "filter df-doc"
    #filterHistByCount(NGRAMS_DOC, 2, verbose=False)
    #kbay.saveDF(NGRAMS_DOC,   SAVEDIR+SrcFileName+'_ngrams_df_doc.txt', sort=False, numDoc=idx)

    if DO_COOCUR:
        print "filter coocur"
        kbay.filter2DHistByCount(COOCUR_S, 2, verbose=True)
        kbay.saveDF2D(COOCUR_S, SAVEDIR + SrcFileName + '_ngrams_coocur.txt')

    print "DONE findNgrams"