Ejemplo n.º 1
0
    def buildTransitionMatrix(self, tagged_corpus: list, train_size):
        """Build a row-normalised table of tag-to-tag relative frequencies.

        The first ``int(train_size * len(tagged_corpus))`` items of
        ``tagged_corpus`` are used. Each item is unpacked as a
        ``(tag1, tag2)`` pair and counted as one occurrence of ``tag2``
        under ``tag1``. Every count is then divided by the total of its
        row and rounded to six decimal places.

        Args:
            tagged_corpus: Sequence whose items unpack into two tags.
            train_size: Multiplier applied to ``len(tagged_corpus)`` to
                pick how many leading items are counted (presumably a
                fraction between 0 and 1 -- confirm with callers).

        Returns:
            The ``ConditionalFreqDist`` mapping ``tag1 -> tag2 -> relative
            frequency``. The same object is stored on
            ``self.TRANSITION_MATRIX``.
        """
        cutoff = int(train_size * len(tagged_corpus))
        train = tagged_corpus[:cutoff]
        # The counts below do not depend on order; the shuffle is kept so the
        # global random state advances exactly as before.
        random.shuffle(train)

        # Pass 1: raw counts of each (tag1, tag2) pair.
        transition = ConditionalFreqDist()
        for prev_tag, next_tag in train:
            if prev_tag in transition:
                row = transition[prev_tag]
            else:
                row = transition[prev_tag] = FreqDist()
            # Counts are kept as floats, starting from 0.0.
            row[next_tag] = row.get(next_tag, 0.0) + 1

        # Pass 2: turn each row of counts into relative frequencies.
        for row in transition.values():
            row_total = sum(row.values(), 0.0)
            for successor in row.keys():
                share = row[successor] / row_total
                row[successor] = round(float("{0:.6f}".format(share)), 6)

        self.TRANSITION_MATRIX = transition
        return transition
#pickle.dump( docfreqs, open( 'docfreqs.p','w' ) ) #apparently this doesn't work because docfreqs is honkin' big


def idf(w):
    """Return the inverse document frequency of word ``w``.

    Computed as ``log(number of BNC file ids + 1) - log(docfreqs[w].B())``,
    where ``docfreqs[w].B()`` is how many documents the word occurs in.
    """
    log_corpus_size = log(len(bnc.fileids()) + 1)
    log_doc_count = log(docfreqs[w].B())
    return log_corpus_size - log_doc_count

def tf_idf(w):
    """Return the tf-idf weight of word ``w``.

    The term frequency is ``docfreqs[w].N()``, i.e. how often the word
    occurs throughout the entire BNC; it is multiplied by ``idf(w)``.
    """
    corpus_count = docfreqs[w].N()
    return corpus_count * idf(w)


# Vocabulary ordered from most to least frequent, keeping only words that
# occur more than twice in total.
wordlist = [
    word
    for word in sorted(
        docfreqs.keys(),
        key=lambda candidate: docfreqs[candidate].N(),
        reverse=True,
    )
    if word not in stopset  # remove this clause to keep stop words
    if docfreqs[word].N() > 2
]

# Row index: the ROWS most frequent words, numbered from 0.
r2i = {word: row for row, word in enumerate(wordlist[:ROWS])}

# Column index: the next COLS words after skipping the 50 most frequent
# ones, which are left out of the context columns.
c2i = {word: col for col, word in enumerate(wordlist[50:COLS + 50])}
#pickle.dump( r2i, open( 'r2iWithoutStops.p','w' ) )
#pickle.dump( c2i, open( 'c2iWithoutstops.p','w' ) )
#pickle.dump( r2i, open( 'r2iWithStops.p','w' ) )
#pickle.dump( c2i, open( 'c2iWithStops.p','w' ) )
#pickle.dump( r2i, open( 'r2iWithStopsNotLemmatized.p','w' ) )
#pickle.dump( c2i, open( 'c2iWithStopsNotLemmatized.p','w' ) )
#pickle.dump( r2i, open( 'r2iWithoutStopsNotLemmatized.p','w' ) )
#pickle.dump( c2i, open( 'c2iWithoutStopsNotLemmatized.p','w' ) )
# Persist the row index. Pickle output is bytes, so the file must be opened in
# binary mode ('w' fails on Python 3); the `with` block also makes sure the
# handle is flushed and closed instead of being left to garbage collection.
with open('r2iWithoutStopsLemmatized.p', 'wb') as r2i_file:
    pickle.dump(r2i, r2i_file)