Code example #1
File: externalvectors.py  Project: afcarl/textylon
def loadExternalRepresentationRepLab(textFile, encoding='utf-8', dimension=100):
    """
    this function loads an external representation IndVoc and CoocMatrix
    the format of the file like this:
    first line is
    vocabulary_count dimensionality
    the other lines are of the follwoin format
    word value value .... value
    and the number of values is the dimensionality in the first line
    """
    IndVoc = Vocabulary()
    CoocMat = Matrix()

    with codecs.open(textFile, 'r', encoding) as f:
        lines = f.readlines()

    # the first line is the vocabulary size and the representation dimensionality
    fs = lines[0].split()
    vocabSize = int(fs[0])
    dimensionality = int(fs[1])

    # each remaining line holds a word followed by its vector values
    matrix = None
    for line in lines[1:]:
        fields = line.split()
        word = fields[0]
        vector = np.array(map(float, fields[1:]))
        if matrix is None:
            matrix = vector
        else:
            matrix = np.vstack((matrix, vector))
        IndVoc.set(word, IndVoc.getlength())

    CoocMat.makeMatrixFromDense(matrix)
    return IndVoc, CoocMat
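A minimal usage sketch for the loader above, assuming the Vocabulary and Matrix classes from textylon are in scope; the file name and its contents are illustrative assumptions, not taken from the project:

# Hypothetical usage sketch; file name and contents are assumptions.
# vectors.txt is expected to look like:
#   3 4
#   dog 0.1 0.2 0.3 0.4
#   cat 0.2 0.1 0.4 0.3
#   car 0.9 0.8 0.7 0.6
IndVoc, CoocMat = loadExternalRepresentationRepLab('vectors.txt', encoding='utf-8')
print IndVoc.getlength()  # number of words loaded, 3 for the file above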
Code example #2
def loadExternalRepresentation(textFile):
    """
    this function loads an external representation IndVoc and CoocMatrix
    the format of the file like this:
    first line is
    vocabulary_count dimensionality
    the other lines are of the follwoin format
    word value value .... value
    and the number of values is the dimensionality in the first line
    """
    global IndVoc
    IndVoc = Vocabulary()
    global CtxVoc
    CtxVoc = Vocabulary()
    global CoocMat
    CoocMat = Matrix()

    f = open(textFile, 'r')
    # the first line is the vocabulary size and the representation dimensionality
    lines = f.readlines()
    theFirstTime = True
    reprDict = {}
    vocabSize = 0
    dimensionality = 0
    for line in lines:
        if theFirstTime:
            theFirstTime = False
            fs = line.split()
            vocabSize = int(fs[0])
            dimensionality = int(fs[1])
            CoocMat.makematrix(vocabSize, dimensionality)
            continue
        fields = line.split()
        word = fields[0]
        l = fields[1:]
        vector = np.array(map(float, l))
        i = 0
        for v in np.nditer(vector):
            CoocMat.update(IndVoc.getlength(), i, v)
            i += 1
        IndVoc.set(word, IndVoc.getlength())
Code example #3
File: dsm-orig.py  Project: afshinrahimi/textylon
def loadExternalRepresentation(textFile):
    """
    this function loads an external representation IndVoc and CoocMatrix
    the format of the file like this:
    first line is
    vocabulary_count dimensionality
    the other lines are of the follwoin format
    word value value .... value
    and the number of values is the dimensionality in the first line
    """
    global IndVoc
    IndVoc = Vocabulary()
    global CtxVoc
    CtxVoc = Vocabulary()
    global CoocMat
    CoocMat = Matrix()
    
    f = open(textFile, 'r')
    # the first line is the vocabulary size and the representation dimensionality
    lines = f.readlines()
    theFirstTime = True
    reprDict = {}
    vocabSize = 0
    dimensionality = 0
    for line in lines:
        if theFirstTime:
            theFirstTime = False
            fs = line.split()
            vocabSize = int(fs[0])
            dimensionality = int(fs[1])
            CoocMat.makematrix(vocabSize, dimensionality)
            continue
        fields = line.split()
        word = fields[0]
        l = fields[1:] 
        vector = np.array(map(float, l))
        i = 0
        for v in np.nditer(vector):
            CoocMat.update(IndVoc.getlength(), i, v)
            i += 1
        IndVoc.set(word, IndVoc.getlength())
Code example #4
File: dsm.py  Project: afcarl/textylon
class DSM(object):
    def __init__(self, encoding='utf-8'):
        self.Words = Counter()
        self.IndVoc = Vocabulary()
        self.CtxVoc = Vocabulary()
        self.CoocMat = Matrix()
        self.numLines = 0
        self.encoding = encoding

    # collect vocabulary and count frequencies
    def count_freqs(self, infile):

        print "Started: " + strftime("%H:%M:%S", gmtime())
        with codecs.open(infile, "r", self.encoding) as inp:
            for line in inp:
                self.numLines += 1
                for wrd in line.split():
                    self.Words[wrd] += 1
        print "Finished: " + strftime("%H:%M:%S", gmtime())
        print "Token count: " + str(sum(self.Words.values()))

    # count cooccurrence frequencies from infile within win
    # frequency thresholds for both index words and context words
    # dsm = distributional semantic model
    def dsm(self, infile, win, index_minf, index_maxf, ctx_minf, ctx_maxf):

        print "Started: " + strftime("%H:%M:%S", gmtime())
        with codecs.open(infile, "r", self.encoding) as inp:
            self.update_vocabulary(index_minf, index_maxf, ctx_minf, ctx_maxf)
            # print self.IndVoc.getlength(), self.CtxVoc.getlength()
            self.CoocMat.makematrix(self.IndVoc.getlength(),
                                    self.CtxVoc.getlength())
            line_nr = 0
            for line in inp:
                cnt = 0
                wrdlst = line.split()
                for wrd in wrdlst:
                    if self.IndVoc.lookup(wrd):
                        # count co-occurrences to the left
                        ctx = 1
                        while ctx <= win:
                            if (cnt - ctx) >= 0:
                                c = wrdlst[cnt - ctx]
                                self.update_counts(c, wrd, ctx_minf, ctx_maxf)
                                ctx += 1
                            else:
                                ctx = win + 1
                        # count co-occurrences to the right
                        ctx = 1
                        while ctx <= win:
                            if (cnt + ctx) < len(wrdlst):
                                c = wrdlst[cnt + ctx]
                                self.update_counts(c, wrd, ctx_minf, ctx_maxf)
                                ctx += 1
                            else:
                                ctx = win + 1
                    cnt += 1
                line_nr += 1
        print "Finished: " + strftime("%H:%M:%S", gmtime())

    # check if the word should be indexed and used as ctx word
    def update_vocabulary(self, index_minf, index_maxf, ctx_minf, ctx_maxf):
        i_cnt = 0
        c_cnt = 0
        for w in self.Words.most_common():
            q = w[1]
            if (q > index_minf) and (q < index_maxf):
                self.IndVoc.set(w[0], i_cnt)
                i_cnt += 1
            if (q > ctx_minf) and (q < ctx_maxf):
                self.CtxVoc.set(w[0], c_cnt)
                c_cnt += 1

    # update cooccurrence counts
    def update_counts(self, w, wrd, minf, maxf):
        if self.CtxVoc.lookup(w):
            self.CoocMat.update(self.IndVoc.getindex(wrd),
                                self.CtxVoc.getindex(w), 1)

    # TODO: implement direction-sensitive dsm (aka HAL)

    ######
    # Misc
    ######

    # clean up vocabularies and co-occurrence matrix
    def clear_ctx(self):
        self.IndVoc.delete()
        self.CtxVoc.delete()
        self.CoocMat.delete()

    # clean up frequency counters
    def clear_freq(self):
        self.Words.clear()

    ############
    # Evaluation
    ############

    # toefl test
    def toefl(self, testfile):
        inp = open(testfile, "r")
        corr = 0
        tot = 0
        unknown_target = []
        unknown_answer = []
        incorrect = []
        for line in inp.readlines():
            flag = False
            target, correct, alt2, alt3, alt4 = line.replace("(", "").split()
            if self.IndVoc.lookup(target):
                targetvec = self.CoocMat.matrix.getrow(
                    self.IndVoc.getindex(target)).todense()
                tot += 1
                if self.IndVoc.lookup(correct):
                    correctvec = self.CoocMat.matrix.getrow(
                        self.IndVoc.getindex(correct)).todense()
                    sim = 1 - sp.distance.cosine(targetvec, correctvec)
                    if sim > 0.0:
                        flag = True
                    else:
                        incorrect.append(target)
                    for i in (alt2, alt3, alt4):
                        if self.IndVoc.lookup(i):
                            i_vec = self.CoocMat.matrix.getrow(
                                self.IndVoc.getindex(i)).todense()
                            i_sim = 1 - sp.distance.cosine(targetvec, i_vec)
                            if i_sim > sim:
                                if not target in incorrect:
                                    incorrect.append(target)
                                flag = False
                    if flag:
                        corr += 1
                else:
                    unknown_answer.append(correct)
            else:
                unknown_target.append(target)
        inp.close()
        print "TOEFL synonym score: " + str(
            float(corr) / float(tot)) + " (" + str(corr) + "/" + str(tot) + ")"
        print "Incorrect: " + str(incorrect)
        print "Unknown targets: " + str(unknown_target)
        print "Unknown answers: " + str(unknown_answer)

        logger.info("TOEFL synonym score: " + str(float(corr) / float(tot)) +
                    " (" + str(corr) + "/" + str(tot) + ")")
        logger.info("Incorrect: " + str(incorrect))
        logger.info("Unknown targets: " + str(unknown_target))
        logger.info("Unknown answers: " + str(unknown_answer))

    # toefl test
    # A is a numpy matrix
    def toefl_mat(self, testfile, A):
        inp = open(testfile, "r")
        flag = False
        corr = 0
        tot = 0
        unknown_target = []
        unknown_answer = []
        incorrect = []
        for line in inp.readlines():
            target, correct, alt2, alt3, alt4 = line.replace("(", "").split()
            if self.IndVoc.lookup(target):
                targetvec = A[self.IndVoc.getindex(target), :]
                tot += 1
                if self.IndVoc.lookup(correct):
                    correctvec = A[self.IndVoc.getindex(correct), :]
                    sim = 1 - sp.distance.cosine(targetvec, correctvec)
                    if sim > 0.0:
                        flag = True
                    for i in (alt2, alt3, alt4):
                        if self.IndVoc.lookup(i):
                            i_vec = A[self.IndVoc.getindex(i), :]
                            i_sim = 1 - sp.distance.cosine(targetvec, i_vec)
                            if i_sim > sim:
                                if not target in incorrect:
                                    incorrect.append(target)
                                flag = False
                    if flag:
                        corr += 1
                else:
                    unknown_answer.append(correct)
            else:
                unknown_target.append(target)
        inp.close()
        print "TOEFL synonym score: " + str(float(corr) / float(tot))
        print "Incorrect: " + str(incorrect)
        print "Unknown targets: " + str(unknown_target)
        print "Unknown answers: " + str(unknown_answer)
        logger.info("TOEFL synonym score: " + str(float(corr) / float(tot)) +
                    " (" + str(corr) + "/" + str(tot) + ")")
        logger.info("Incorrect: " + str(incorrect))
        logger.info("Unknown targets: " + str(unknown_target))
        logger.info("Unknown answers: " + str(unknown_answer))

    # find the nr nearest neighbors to word using cosine similarity
    # TODO: optimization
    def nns(self, word, nr):
        res = {}
        if self.IndVoc.lookup(word):
            w_vec = self.CoocMat.matrix.getrow(
                self.IndVoc.getindex(word)).todense()
            for k in self.IndVoc.hsh:
                k_vec = self.CoocMat.matrix.getrow(
                    self.IndVoc.getindex(k)).todense()
                sim = 1 - sp.distance.cosine(w_vec, k_vec)
                if (not math.isnan(sim)) and (not math.isinf(sim)):
                    res[k] = sim
        sorted_res = sorted(res.iteritems(),
                            key=lambda (k, v): v,
                            reverse=True)
        # print word, sorted_res[0:nr]
        return sorted_res[0:nr]
        # for r in sorted_res[1:nr]: # 1 to avoid including word
        # print r[0] + ' ' + str(r[1][0][0])
    def loadExternalRepresentation(self, textFile):
        """
        this function loads an external representation self.IndVoc and CoocMatrix
        the format of the file like this:
        first line is
        vocabulary_count dimensionality
        the other lines are of the follwoin format
        word value value .... value
        and the number of values is the dimensionality in the first line
        """

        f = open(textFile, 'r')
        # the first line is the vocabulary size and the representation dimensionality
        lines = f.readlines()
        theFirstTime = True
        reprDict = {}
        vocabSize = 0
        dimensionality = 0
        for line in lines:
            if theFirstTime:
                theFirstTime = False
                fs = line.split()
                vocabSize = int(fs[0])
                dimensionality = int(fs[1])
                self.CoocMat.makematrix(vocabSize, dimensionality)
                continue
            fields = line.split()
            word = fields[0]
            l = fields[1:]
            vector = np.array(map(float, l))
            i = 0
            for v in np.nditer(vector):
                self.CoocMat.update(self.IndVoc.getlength(), i, v)
                i += 1
            self.IndVoc.set(word, self.IndVoc.getlength())

    def dumpVocabAndCoocMatrix(self):
        print "start pickling dsm model..." + strftime("%H:%M:%S", gmtime())
        pickle.dump((self.IndVoc, self.CoocMat, self.CtxVoc),
                    open("dsm.pkl", "wb"))
        print "Finished: " + strftime("%H:%M:%S", gmtime())

    def loadVocabAndCoocMatrix(self):
        print "start depickling dsm model..." + strftime("%H:%M:%S", gmtime())
        self.IndVoc, self.CoocMat, self.CtxVoc = pickle.load(
            open("dsm.pkl", "rb"))
        print "Finished: " + strftime("%H:%M:%S", gmtime())

    def tfidf(self):
        transformer = TfidfTransformer()
        self.CoocMat.matrix = transformer.fit_transform(self.CoocMat.matrix)
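A minimal pipeline sketch for the DSM class above; the corpus file, test file, window size, and frequency thresholds are illustrative assumptions, not values from the project:

# Hypothetical pipeline sketch (file names and thresholds are assumptions).
model = DSM(encoding='utf-8')
model.count_freqs('corpus.txt')        # pass 1: collect word frequencies
model.dsm('corpus.txt', win=2,         # pass 2: windowed co-occurrence counts
          index_minf=5, index_maxf=100000,
          ctx_minf=5, ctx_maxf=100000)
model.tfidf()                          # optional tf-idf reweighting
print model.nns('music', 10)           # 10 nearest neighbours by cosine similarity
model.toefl('toefl.txt')               # TOEFL synonym evaluation
model.dumpVocabAndCoocMatrix()         # pickle IndVoc, CoocMat, CtxVoc to dsm.pkl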
Code example #5
File: dsm.py  Project: afshinrahimi/textylon
class DSM(object):
    
    def __init__(self, encoding='utf-8'):
        self.Words = Counter()
        self.IndVoc = Vocabulary()
        self.CtxVoc = Vocabulary()
        self.CoocMat = Matrix()
        self.numLines = 0
        self.encoding = encoding
    # collect vocabulary and count frequencies
    def count_freqs(self, infile):

        print "Started: " + strftime("%H:%M:%S", gmtime())
        with codecs.open(infile, "r", self.encoding) as inp:
            for line in inp:
                self.numLines += 1
                for wrd in line.split():
                    self.Words[wrd] += 1
        print "Finished: " + strftime("%H:%M:%S", gmtime())
        print "Token count: " + str(sum(self.Words.values()))
    
    # count cooccurrence frequencies from infile within win
    # frequency thresholds for both index words and context words
    # dsm = distributional semantic model
    def dsm(self, infile, win, index_minf, index_maxf, ctx_minf, ctx_maxf):

        print "Started: " + strftime("%H:%M:%S", gmtime())
        with codecs.open(infile, "r", self.encoding) as inp:
            self.update_vocabulary(index_minf, index_maxf, ctx_minf, ctx_maxf)
            # print self.IndVoc.getlength(), self.CtxVoc.getlength()
            self.CoocMat.makematrix(self.IndVoc.getlength(), self.CtxVoc.getlength())
            line_nr = 0
            for line in inp:
                cnt = 0
                wrdlst = line.split()
                for wrd in wrdlst:
                    if self.IndVoc.lookup(wrd):
                        # count co-occurrences to the left
                        ctx = 1
                        while ctx <= win:
                            if (cnt - ctx) >= 0:
                                c = wrdlst[cnt - ctx]
                                self.update_counts(c, wrd, ctx_minf, ctx_maxf)
                                ctx += 1
                            else:
                                ctx = win + 1
                        # count co-occurrences to the right
                        ctx = 1
                        while ctx <= win:
                            if (cnt + ctx) < len(wrdlst):
                                c = wrdlst[cnt + ctx]
                                self.update_counts(c, wrd, ctx_minf, ctx_maxf)
                                ctx += 1
                            else:
                                ctx = win + 1
                    cnt += 1
                line_nr += 1
        print "Finished: " + strftime("%H:%M:%S", gmtime())
    
    # check if the word should be indexed and used as ctx word
    def update_vocabulary(self, index_minf, index_maxf, ctx_minf, ctx_maxf):
        i_cnt = 0
        c_cnt = 0
        for w in self.Words.most_common():
            q = w[1]
            if (q > index_minf) and (q < index_maxf):
                self.IndVoc.set(w[0], i_cnt)
                i_cnt += 1
            if (q > ctx_minf) and (q < ctx_maxf):
                self.CtxVoc.set(w[0], c_cnt)
                c_cnt += 1
    
    # update cooccurrence counts
    def update_counts(self, w, wrd, minf, maxf):
        if self.CtxVoc.lookup(w):
            self.CoocMat.update(self.IndVoc.getindex(wrd), self.CtxVoc.getindex(w), 1)
    
    # TODO: implement direction-sensitive dsm (aka HAL)
    
    ######
    # Misc
    ######
    
    # clean up vocabularies and co-occurrence matrix
    def clear_ctx(self):
        self.IndVoc.delete()
        self.CtxVoc.delete()
        self.CoocMat.delete()
    
    # clean up frequency counters
    def clear_freq(self):
        self.Words.clear()
    
    ############
    # Evaluation
    ############
    
    # toefl test
    def toefl(self, testfile):
        inp = open(testfile, "r")
        corr = 0
        tot = 0
        unknown_target = []
        unknown_answer = []
        incorrect = []
        for line in inp.readlines():
            flag = False
            target, correct, alt2, alt3, alt4 = line.replace("(", "").split()
            if self.IndVoc.lookup(target):
                targetvec = self.CoocMat.matrix.getrow(self.IndVoc.getindex(target)).todense()
                tot += 1
                if self.IndVoc.lookup(correct):
                    correctvec = self.CoocMat.matrix.getrow(self.IndVoc.getindex(correct)).todense()
                    sim = 1 - sp.distance.cosine(targetvec, correctvec)
                    if sim > 0.0:
                        flag = True
                    else:
                        incorrect.append(target)
                    for i in (alt2, alt3, alt4):
                        if self.IndVoc.lookup(i):
                            i_vec = self.CoocMat.matrix.getrow(self.IndVoc.getindex(i)).todense()
                            i_sim = 1 - sp.distance.cosine(targetvec, i_vec)
                            if i_sim > sim:
                                if not target in incorrect:
                                    incorrect.append(target)
                                flag = False
                    if flag:
                        corr += 1
                else:
                    unknown_answer.append(correct)
            else:
                unknown_target.append(target)
        inp.close()
        print "TOEFL synonym score: " + str(float(corr) / float(tot)) + " (" + str(corr) + "/" + str(tot) + ")"
        print "Incorrect: " + str(incorrect)
        print "Unknown targets: " + str(unknown_target)
        print "Unknown answers: " + str(unknown_answer)
        
        logger.info("TOEFL synonym score: " + str(float(corr) / float(tot)) + " (" + str(corr) + "/" + str(tot) + ")")
        logger.info("Incorrect: " + str(incorrect))
        logger.info("Unknown targets: " + str(unknown_target))
        logger.info("Unknown answers: " + str(unknown_answer))
    
    # toefl test
    # A is a numpy matrix
    def toefl_mat(self, testfile, A):
        inp = open(testfile, "r")
        flag = False
        corr = 0
        tot = 0
        unknown_target = []
        unknown_answer = []
        incorrect = []
        for line in inp.readlines():
            target, correct, alt2, alt3, alt4 = line.replace("(", "").split()
            if self.IndVoc.lookup(target):
                targetvec = A[self.IndVoc.getindex(target), :]
                tot += 1
                if self.IndVoc.lookup(correct):
                    correctvec = A[self.IndVoc.getindex(correct), :]
                    sim = 1 - sp.distance.cosine(targetvec, correctvec)
                    if sim > 0.0:
                        flag = True
                    for i in (alt2, alt3, alt4):
                        if self.IndVoc.lookup(i):
                            i_vec = A[self.IndVoc.getindex(i), :]
                            i_sim = 1 - sp.distance.cosine(targetvec, i_vec)
                            if i_sim > sim:
                                if not target in incorrect:
                                    incorrect.append(target)
                                flag = False
                    if flag:
                        corr += 1
                else:
                    unknown_answer.append(correct)
            else:
                unknown_target.append(target)
        inp.close()
        print "TOEFL synonym score: " + str(float(corr) / float(tot))
        print "Incorrect: " + str(incorrect)
        print "Unknown targets: " + str(unknown_target)
        print "Unknown answers: " + str(unknown_answer)
        logger.info("TOEFL synonym score: " + str(float(corr) / float(tot)) + " (" + str(corr) + "/" + str(tot) + ")")
        logger.info("Incorrect: " + str(incorrect))
        logger.info("Unknown targets: " + str(unknown_target))
        logger.info("Unknown answers: " + str(unknown_answer))
    
    # find the nr nearest neighbors to word using cosine similarity
    # TODO: optimization
    def nns(self, word, nr):
        res = {}
        if self.IndVoc.lookup(word):
            w_vec = self.CoocMat.matrix.getrow(self.IndVoc.getindex(word)).todense()
            for k in self.IndVoc.hsh:
                k_vec = self.CoocMat.matrix.getrow(self.IndVoc.getindex(k)).todense()
                sim = 1 - sp.distance.cosine(w_vec, k_vec)
                if (not math.isnan(sim)) and (not math.isinf(sim)):
                    res[k] = sim
        sorted_res = sorted(res.iteritems(), key=lambda(k, v): v, reverse=True)
        # print word, sorted_res[0:nr]
        return sorted_res[0:nr]
        # for r in sorted_res[1:nr]: # 1 to avoid including word
            # print r[0] + ' ' + str(r[1][0][0])
    def loadExternalRepresentation(self, textFile):
        """
        this function loads an external representation self.IndVoc and CoocMatrix
        the format of the file like this:
        first line is
        vocabulary_count dimensionality
        the other lines are of the follwoin format
        word value value .... value
        and the number of values is the dimensionality in the first line
        """

        f = open(textFile, 'r')
        # the first line is the vocabulary size and the representation dimensionality
        lines = f.readlines()
        theFirstTime = True
        reprDict = {}
        vocabSize = 0
        dimensionality = 0
        for line in lines:
            if theFirstTime:
                theFirstTime = False
                fs = line.split()
                vocabSize = int(fs[0])
                dimensionality = int(fs[1])
                self.CoocMat.makematrix(vocabSize, dimensionality)
                continue
            fields = line.split()
            word = fields[0]
            l = fields[1:] 
            vector = np.array(map(float, l))
            i = 0
            for v in np.nditer(vector):
                self.CoocMat.update(self.IndVoc.getlength(), i, v)
                i += 1
            self.IndVoc.set(word, self.IndVoc.getlength())
    def dumpVocabAndCoocMatrix(self):
        print "start pickling dsm model..." + strftime("%H:%M:%S", gmtime())
        pickle.dump((self.IndVoc, self.CoocMat, self.CtxVoc), open("dsm.pkl", "wb"))
        print "Finished: " + strftime("%H:%M:%S", gmtime())
    def loadVocabAndCoocMatrix(self):
        print "start depickling dsm model..." + strftime("%H:%M:%S", gmtime())
        self.IndVoc, self.CoocMat, self.CtxVoc = pickle.load(open("dsm.pkl", "rb"))
        print "Finished: " + strftime("%H:%M:%S", gmtime())
    def tfidf(self):
        transformer = TfidfTransformer()
        self.CoocMat.matrix = transformer.fit_transform(self.CoocMat.matrix)