コード例 #1
0
ファイル: dsm.py プロジェクト: afcarl/textylon
 def __init__(self, encoding='utf-8'):
     """Start with an empty word counter, vocabularies and matrix.

     The `encoding` argument is only stored on the instance here; no
     file is opened or read by the constructor.
     """
     self.Words = Counter()
     # Two separate vocabularies: one for index words, one for contexts.
     self.IndVoc, self.CtxVoc = Vocabulary(), Vocabulary()
     self.CoocMat = Matrix()
     self.numLines, self.encoding = 0, encoding
コード例 #2
0
ファイル: externalvectors.py プロジェクト: afcarl/textylon
def loadExternalRepresentationRepLab(textFile, encoding='utf-8', dimension=100):
    """
    Load an external representation into an IndVoc and a CoocMat.

    The file is expected to look like this (whitespace separated):
        first line:   vocabulary_count dimensionality
        other lines:  word value value .... value
    where the number of values per line is the dimensionality announced
    in the first line.

    textFile  -- path of the representation file
    encoding  -- encoding used to decode the file
    dimension -- not used by this function; kept so that existing
                 callers that pass it keep working

    Returns the tuple (IndVoc, CoocMat).  Words are registered in file
    order, each under the id IndVoc.getlength() at the time it is added,
    and the vectors are stacked in the same order before being handed to
    CoocMat.makeMatrixFromDense.

    NOTE(review): the previous body first ran two passes that split each
    line on tabs and commas and then called f.readlines() on the already
    closed file, so it could never return.  Those passes were dropped in
    favour of the format documented above -- confirm against a real
    RepLab file that this is the intended layout.
    """
    IndVoc = Vocabulary()
    CoocMat = Matrix()

    # Read the whole file while the handle is still open.
    with codecs.open(textFile, 'r', encoding) as f:
        lines = f.readlines()

    if lines:
        # The header is parsed so that a file without it fails early;
        # the matrix shape itself comes from the rows actually read.
        header = lines[0].split()
        vocabSize = int(header[0])
        dimensionality = int(header[1])

    rows = []
    for line in lines[1:]:
        fields = line.split()
        if not fields:
            # Blank line (for example a trailing newline): nothing to load.
            continue
        rows.append(np.array([float(value) for value in fields[1:]]))
        IndVoc.set(fields[0], IndVoc.getlength())

    # Stack once instead of re-copying the matrix for every row.  A single
    # row is passed on unstacked and an empty file yields None, as the
    # row-by-row construction did.
    if not rows:
        matrix = None
    elif len(rows) == 1:
        matrix = rows[0]
    else:
        matrix = np.vstack(rows)

    CoocMat.makeMatrixFromDense(matrix)
    return IndVoc, CoocMat
コード例 #3
0
def loadExternalRepresentation(textFile):
    """
    Load an external representation into an IndVoc and a CoocMat,
    handing the data lines to a pool of String2VectorThread objects.

    The file is expected to look like this:
        first line:   vocabulary_count dimensionality
        other lines:  word value value .... value
    where the number of values per line is the dimensionality announced
    in the first line.

    textFile -- path of the representation file

    Returns the tuple (IndVoc, CoocMat).  Every data line is put on an
    input queue; whatever ends up on the output priority queue is stacked
    into one dense matrix and passed to CoocMat.makeMatrixFromDense.
    """
    IndVoc = Vocabulary()
    CoocMat = Matrix()

    # Close the file as soon as its content is in memory.
    with open(textFile, 'r') as f:
        lines = f.readlines()

    q = Queue.Queue()
    threads = []
    lock = threading.Lock()
    outQ = Queue.PriorityQueue()

    if lines:
        # the first line is the vocabulary size and the representation
        # dimensionality; parsed so a malformed header fails early
        fs = lines[0].split()
        vocabSize = int(fs[0])
        dimensionality = int(fs[1])
    for line in lines[1:]:
        q.put(line)

    # NOTE(review): the threads are only constructed here, never started;
    # presumably String2VectorThread starts itself -- confirm.
    for i in range(0, 20):
        threads.append(String2VectorThread(q, IndVoc, outQ, lock))

    # NOTE(review): this loop leaves once the *output* queue is empty,
    # although the rows are read from that same queue right below.
    # Waiting on the input queue `q` looks like the intent -- confirm
    # against String2VectorThread before changing it.
    while True:
        time.sleep(30)
        print("%d is the size of the queue" % outQ.qsize())
        if outQ.empty():
            break

    print("building a matrix from indivisual rows...")
    rows = []
    while not outQ.empty():
        rows.append(outQ.get())

    # np.vstack takes ONE sequence of arrays; the former call passed the
    # two arrays as separate arguments, which raises TypeError.  Stacking
    # once also avoids re-copying the matrix for every row.  A single
    # item is passed on unstacked and no items yields None, as before.
    if not rows:
        matrix = None
    elif len(rows) == 1:
        matrix = rows[0]
    else:
        matrix = np.vstack(rows)

    CoocMat.makeMatrixFromDense(matrix)
    print("finished building the matrix.")
    return IndVoc, CoocMat
コード例 #4
0
def dsm(infile, win, index_minf, index_maxf, ctx_minf, ctx_maxf):
    """
    Rebuild the module-level IndVoc, CtxVoc and CoocMat from a text file.

    infile     -- path of the input file, one whitespace-tokenised text
                  unit per line
    win        -- integer window size: how many tokens to the left and to
                  the right of an index word are passed to update_counts
    index_minf, index_maxf, ctx_minf, ctx_maxf
               -- thresholds forwarded unchanged to update_vocabulary and
                  update_counts

    Nothing is returned; the results live in the globals IndVoc, CtxVoc
    and CoocMat.  Start and finish times are printed.
    """
    global IndVoc
    IndVoc = Vocabulary()
    global CtxVoc
    CtxVoc = Vocabulary()
    global CoocMat
    CoocMat = Matrix()
    print("Started: " + strftime("%H:%M:%S", gmtime()))
    # The with-block closes the file even if counting raises.
    with open(infile, "r") as inp:
        update_vocabulary(index_minf, index_maxf, ctx_minf, ctx_maxf)
        # Single pre-formatted argument so the output is the same under
        # the Python 2 print statement and the Python 3 print function.
        print("%s %s" % (IndVoc.getlength(), CtxVoc.getlength()))
        CoocMat.makematrix(IndVoc.getlength(), CtxVoc.getlength())
        for line in inp:
            wrdlst = line.split()
            last = len(wrdlst) - 1
            for cnt, wrd in enumerate(wrdlst):
                if not IndVoc.lookup(wrd):
                    continue
                # count co-occurrences to the left, nearest token first,
                # stopping at the start of the line
                for ctx in range(1, min(win, cnt) + 1):
                    update_counts(wrdlst[cnt - ctx], wrd, ctx_minf, ctx_maxf)
                # count co-occurrences to the right, nearest token first,
                # stopping at the end of the line
                for ctx in range(1, min(win, last - cnt) + 1):
                    update_counts(wrdlst[cnt + ctx], wrd, ctx_minf, ctx_maxf)
    print("Finished: " + strftime("%H:%M:%S", gmtime()))
コード例 #5
0
def loadExternalRepresentation(textFile):
    """
    Load an external representation into the module-level IndVoc and
    CoocMat (CtxVoc is reset to an empty Vocabulary).

    The file is expected to look like this:
        first line:   vocabulary_count dimensionality
        other lines:  word value value .... value
    where the number of values per line is the dimensionality announced
    in the first line.

    textFile -- path of the representation file

    The header sizes are passed to CoocMat.makematrix.  Each data line
    then gets the row IndVoc.getlength(), its values are written to
    columns 0, 1, 2, ... of that row via CoocMat.update, and the word is
    registered under that row id.
    """
    global IndVoc
    IndVoc = Vocabulary()
    global CtxVoc
    CtxVoc = Vocabulary()
    global CoocMat
    CoocMat = Matrix()

    # Close the file as soon as its content is in memory.
    with open(textFile, 'r') as f:
        lines = f.readlines()

    vocabSize = 0
    dimensionality = 0
    if lines:
        # the first line is the vocabulary size and the representation
        # dimensionality
        fs = lines[0].split()
        vocabSize = int(fs[0])
        dimensionality = int(fs[1])
        CoocMat.makematrix(vocabSize, dimensionality)

    for line in lines[1:]:
        fields = line.split()
        if not fields:
            # Blank line (for example a trailing newline): nothing to load.
            continue
        row = IndVoc.getlength()
        vector = np.array([float(value) for value in fields[1:]])
        # The column index has to advance with every value; it used to
        # stay at 0, so each value overwrote the previous one in column 0.
        for col, value in enumerate(vector):
            CoocMat.update(row, col, value)
        IndVoc.set(fields[0], row)