def __init__(self, encoding='utf-8'):
    """
    set up a new instance with a fresh word counter, fresh index and
    context vocabularies, a fresh co-occurrence matrix and a line count of 0

    encoding is only stored on the instance here (default 'utf-8')
    """
    # plain values first, then the container objects
    self.encoding = encoding
    self.numLines = 0

    self.Words = Counter()
    self.IndVoc = Vocabulary()
    self.CtxVoc = Vocabulary()
    self.CoocMat = Matrix()
def loadExternalRepresentationRepLab(textFile, encoding='utf-8', dimension=100):
    """
    this function loads an external representation into a new IndVoc
    (Vocabulary) and CoocMat (Matrix) and returns both as a tuple

    the body currently contains two different parsers:

    1. two passes over textFile (opened with codecs and the given encoding)
       that split every line on a tab; the first field is the word and, in
       the second pass, the second field is split on ',' and converted to
       floats
    2. a tail that expects the format below (whitespace separated):
       first line is
       vocabulary_count dimensionality
       the other lines are of the following format
       word value value .... value
       and the number of values is the dimensionality in the first line

    textFile  -- path of the representation file
    encoding  -- encoding passed to codecs.open
    dimension -- not used anywhere in the body

    NOTE(review): the two parsers describe incompatible file formats, and the
    tail reads from `f` after the `with` blocks; this looks like an unfinished
    refactoring -- confirm which format is intended before relying on it.
    """
    IndVoc = Vocabulary()
    CoocMat = Matrix()
    # pass 1: register every word (first tab-separated field) under a
    # running id that starts at 0
    wordID = 0
    with codecs.open(textFile, 'r', encoding) as f:
        for line in f:
            fields = line.split('\t')
            word = fields[0]
            IndVoc.set(word, wordID)
            wordID += 1
    # pass 2: parse the comma-separated values of every line
    # NOTE(review): `vector` is overwritten on every iteration and never
    # stored, so this pass has no visible effect besides counting lines
    wordID = 0
    with codecs.open(textFile, 'r', encoding) as f:
        for line in f:
            fields = line.split('\t')
            word = fields[0]
            vector = map(float, fields[1].split(','))
            wordID += 1
    # the first line is the vocabulary size and the representation dimensionality
    # NOTE(review): with the nesting shown here `f` has already been closed by
    # the `with` block, so this readlines() call is expected to fail -- confirm
    lines = f.readlines()
    theFirstTime = True
    reprDict = {}  # never read below
    vocabSize = 0
    dimensionality = 0
    matrix = None
    # called while both sizes are still 0; the header values parsed in the
    # loop below are never passed to makematrix
    CoocMat.makematrix(vocabSize, dimensionality)
    numRows = 0
    for line in lines:
        numRows += 1
        if theFirstTime:
            # header line: vocabulary size and dimensionality
            theFirstTime = False
            fs = line.split()
            vocabSize = int(fs[0])
            dimensionality = int(fs[1])
            continue
        fields = line.split()
        word = fields[0]
        l = fields[1:]
        vector = np.array(map(float, l))
        # numRows == 2 is the first data line (the header was line 1): it
        # starts the matrix, every later vector is stacked below it
        if numRows == 2:
            matrix = vector
        else:
            matrix = np.vstack((matrix, vector))
        # the word's index is the vocabulary length before insertion
        # NOTE(review): pass 1 already filled IndVoc, so these indices would
        # continue after the pass-1 ids -- verify against Vocabulary.set
        IndVoc.set(word, IndVoc.getlength())
    CoocMat.makeMatrixFromDense(matrix)
    return IndVoc, CoocMat
def loadExternalRepresentation(textFile):
    """
    this function loads an external representation into a new IndVoc
    (Vocabulary) and CoocMat (Matrix) using worker threads

    the format of the file is like this (whitespace separated):
    first line is
    vocabulary_count dimensionality
    the other lines are of the following format
    word value value .... value
    and the number of values is the dimensionality in the first line

    every line after the header is put on a work queue; 20
    String2VectorThread objects are created with that queue, the vocabulary,
    an output priority queue and a lock.  whatever ends up on the output
    queue is stacked row-wise and handed to CoocMat.makeMatrixFromDense

    textFile -- path of the representation file
    returns  -- the tuple (IndVoc, CoocMat)
    raises   -- ValueError / IndexError when the first line is not two integers
    """
    IndVoc = Vocabulary()
    CoocMat = Matrix()

    # read everything up front and release the file handle right away
    with open(textFile, 'r') as f:
        lines = f.readlines()

    q = Queue.Queue()
    outQ = Queue.PriorityQueue()
    lock = threading.Lock()

    if lines:
        # the first line is the vocabulary size and the representation
        # dimensionality; the numbers are not needed below, but converting
        # them still rejects a file that lacks a proper header
        header = lines[0].split()
        _vocabSize, _dimensionality = int(header[0]), int(header[1])
        for line in lines[1:]:
            q.put(line)

    # keep references to the workers for the lifetime of this call
    # NOTE(review): presumably the constructor also starts the thread --
    # verify against String2VectorThread
    threads = [String2VectorThread(q, IndVoc, outQ, lock) for _ in range(20)]

    # NOTE(review): this polls the *output* queue and leaves the loop once it
    # is empty, which looks inverted for a "wait until the workers are done"
    # loop -- left unchanged, confirm how the workers signal completion
    while True:
        time.sleep(30)
        print("%d is the size of the queue" % outQ.qsize())
        if outQ.empty():
            break

    print("building a matrix from indivisual rows...")
    rows = []
    while not outQ.empty():
        rows.append(outQ.get())

    if not rows:
        matrix = None
    elif len(rows) == 1:
        # a single result is passed on untouched, not wrapped in a 2-D array
        matrix = rows[0]
    else:
        # np.vstack takes ONE sequence of arrays; stacking once at the end
        # also avoids re-copying the growing matrix for every row
        matrix = np.vstack(rows)

    CoocMat.makeMatrixFromDense(matrix)
    print("finished building the matrix.")
    return IndVoc, CoocMat
def dsm(infile, win, index_minf, index_maxf, ctx_minf, ctx_maxf):
    """
    build the co-occurrence counts for the text in infile

    resets the module-level IndVoc, CtxVoc and CoocMat, calls
    update_vocabulary with the four limits, sizes CoocMat to
    (IndVoc length, CtxVoc length) and then walks the file line by line.
    for every whitespace-separated token accepted by IndVoc.lookup,
    update_counts is called for each neighbour on the same line that is at
    most win positions away, nearest first, left side before right side

    infile                 -- path of the input text file
    win                    -- window size (tokens on each side)
    index_minf, index_maxf -- passed to update_vocabulary
                              (presumably frequency limits -- TODO confirm)
    ctx_minf, ctx_maxf     -- passed to update_vocabulary and update_counts

    returns nothing; the results live in the module-level globals
    """
    global IndVoc, CtxVoc, CoocMat
    IndVoc = Vocabulary()
    CtxVoc = Vocabulary()
    CoocMat = Matrix()

    print("Started: " + strftime("%H:%M:%S", gmtime()))

    # truncating keeps the iteration count of the old `ctx <= win` loops for
    # non-integer window sizes; zero or negative windows visit no neighbour
    reach = int(win)

    # the context manager closes the file even if one of the helpers raises
    with open(infile, "r") as inp:
        update_vocabulary(index_minf, index_maxf, ctx_minf, ctx_maxf)
        # one formatted string prints the same text as `print a, b`
        print("%s %s" % (IndVoc.getlength(), CtxVoc.getlength()))
        CoocMat.makematrix(IndVoc.getlength(), CtxVoc.getlength())

        # iterate lazily instead of loading the whole file with readlines()
        for line in inp:
            wrdlst = line.split()
            last = len(wrdlst) - 1
            for cnt, wrd in enumerate(wrdlst):
                if not IndVoc.lookup(wrd):
                    continue
                # count co-occurrences to the left (bounded by line start)
                for ctx in range(1, min(reach, cnt) + 1):
                    update_counts(wrdlst[cnt - ctx], wrd, ctx_minf, ctx_maxf)
                # count co-occurrences to the right (bounded by line end)
                for ctx in range(1, min(reach, last - cnt) + 1):
                    update_counts(wrdlst[cnt + ctx], wrd, ctx_minf, ctx_maxf)

    print("Finished: " + strftime("%H:%M:%S", gmtime()))
def loadExternalRepresentation(textFile):
    """
    this function loads an external representation into the module-level
    IndVoc and CoocMat (CtxVoc is reset to a fresh Vocabulary as well)

    the format of the file is like this (whitespace separated):
    first line is
    vocabulary_count dimensionality
    the other lines are of the following format
    word value value .... value
    and the number of values is the dimensionality in the first line

    CoocMat is sized from the header with makematrix; the k-th word line
    (counted by the current IndVoc length) fills row k, one column per value

    textFile -- path of the representation file
    returns nothing; an empty file leaves the fresh, unsized objects in place
    raises   -- ValueError / IndexError when the first line is not two
                integers, ValueError when a value is not a number
    """
    global IndVoc, CtxVoc, CoocMat
    IndVoc = Vocabulary()
    CtxVoc = Vocabulary()
    CoocMat = Matrix()

    # the context manager makes sure the file handle is released
    with open(textFile, 'r') as f:
        lines = iter(f)
        first = next(lines, None)
        if first is None:
            return

        # the first line is the vocabulary size and the representation
        # dimensionality
        header = first.split()
        CoocMat.makematrix(int(header[0]), int(header[1]))

        for line in lines:
            fields = line.split()
            if not fields:
                # tolerate blank lines (e.g. a trailing newline at the end)
                continue
            word = fields[0]
            # a list comprehension gives a real sequence regardless of
            # whether map() returns a list or an iterator
            vector = np.array([float(x) for x in fields[1:]], dtype=float)
            # the row index is the vocabulary length before the word is added
            row = IndVoc.getlength()
            # enumerate supplies the column index; previously the index was
            # never advanced, so every value landed in column 0
            for col, value in enumerate(vector):
                CoocMat.update(row, col, value)
            IndVoc.set(word, row)