def __init__(self, fName, vocabF, batchSize, seqLen, minContext, rand=False):
    self.seqLen = seqLen + 1
    self.minContext = minContext
    self.batchSize = batchSize
    self.rand = rand
    self.pos = 0
    self.vocab = utils.loadDict(vocabF)
    self.invVocab = utils.invertDict(self.vocab)
    realLen = self.seqLen - minContext

    # Read the corpus and trim it so it divides evenly into rows of realLen tokens.
    with open(fName) as f:
        dat = f.read()
    dat = dat[:int(len(dat) / realLen) * realLen]
    dat = nlp.applyVocab(dat, self.vocab)
    dat = np.asarray(list(dat))
    dat = dat.reshape(-1, realLen)
    # Extend each row with the first minContext tokens of the next row, so
    # consecutive sequences overlap by minContext tokens of context.
    dat = np.hstack((dat[:-1], dat[1:, :minContext]))

    self.batches = dat.shape[0] // batchSize
    self.dat = dat[:self.batches * batchSize]
    self.m = self.dat.size
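# A minimal sketch (not from the original source) of the reshape/hstack step
# above: with realLen = 4 and minContext = 2, each row is extended with the
# first two tokens of the row after it, so consecutive sequences overlap.
import numpy as np

def _context_overlap_demo():
    dat = np.arange(12).reshape(-1, 4)        # [[0..3], [4..7], [8..11]]
    out = np.hstack((dat[:-1], dat[1:, :2]))
    assert (out == [[0, 1, 2, 3, 4, 5],
                    [4, 5, 6, 7, 8, 9]]).all()
    return out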
def __init__(self, fName, vocabF, batchSize, seqLen, minContext, rand=False, mode='ptb'):
    self.seqLen = seqLen + 1
    self.minContext = minContext
    self.batchSize = batchSize
    self.rand = rand
    self.pos = 0
    self.vocab = utils.loadDict(vocabF)
    self.invVocab = utils.invertDict(self.vocab)
    realLen = self.seqLen - minContext

    if mode == 'french':
        # Latin-1 corpus: lowercase it and strip all punctuation except apostrophes.
        with open(fName, encoding="ISO-8859-1") as f:
            dat = f.read()
        dat = dat.lower()
        remove = string.punctuation.replace("'", "")
        pattern = r"[{}]".format(re.escape(remove))  # escape so ], \ and ^ stay literal
        dat = re.sub(pattern, "", dat)
        dat = dat[:int(len(dat) / realLen) * realLen]
    elif mode == 'english_words':
        with open(fName) as f:
            dat = f.read()
        dat = dat.lower()
    elif mode == 'ptb':
        with open(fName) as f:
            dat = f.read()
        dat = dat[:int(len(dat) / realLen) * realLen]

    dat = nlp.applyVocab(dat, self.vocab, mode)
    dat = np.asarray(list(dat))
    if mode == 'english_words':
        # Word-level vocab changes the token count, so trim only after applyVocab.
        dat = dat[:int(len(dat) / realLen) * realLen]
    dat = dat.reshape(-1, realLen)
    dat = np.hstack((dat[:-1], dat[1:, :minContext]))

    self.batches = dat.shape[0] // batchSize
    self.dat = dat[:self.batches * batchSize]
    self.m = self.dat.size
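# Hypothetical usage (the class name and file paths are assumptions, not from
# the original source); seqLen counts predicted tokens, minContext the overlap:
#
#     loader = Loader('data/ptb.train.txt', 'data/vocab.pkl',
#                     batchSize=32, seqLen=64, minContext=16, mode='ptb')
#     print(loader.batches, loader.m)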
def countsToFile():
    # One slot per possible histogram name; the array sizing implies name_hash
    # maps a name into [0, 2 * (2 * ut.N) ** 2).
    size = 2 * (2 * ut.N) ** 2
    array1 = np.zeros(size)
    array2 = np.zeros(size)
    array = [""] * size

    data, tot_num = ut.loadDict(file)
    hists = ut.histFromDict(data)
    for h in hists:
        idx = name_hash(h)
        array[idx] = h
        array1[idx] = len(hists[h])
        array2[idx] = sum(2 * fun(np.asarray(hists[h])))

    # Dump each table as "index: value", one entry per line.
    with open("hash.txt", "w") as f:
        for i in range(len(array)):
            f.write(str(i) + ": " + str(array[i]) + "\n")
    with open("counts1.txt", "w") as f:
        for i in range(len(array1)):
            f.write(str(i) + ": " + str(array1[i]) + "\n")
    with open("counts2.txt", "w") as f:
        for i in range(len(array2)):
            f.write(str(i) + ": " + str(array2[i]) + "\n")
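# A small companion sketch (assumed, not in the original): reading one of the
# count files back into a dict, relying only on the "index: value" line format
# written above.
def _load_counts(path="counts1.txt"):
    counts = {}
    with open(path) as f:
        for line in f:
            idx, val = line.split(": ", 1)
            counts[int(idx)] = float(val)
    return counts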
def runAllHists():
    # data_list, tot_num = ut.loadAllFiles()
    # hists = ut.histFromDictList(data_list)
    data, tot_num = ut.loadDict(file)
    hists = ut.histFromDict(data)
    # saveToCache(data_list, "data")
    saveToCache(hists, "hists")
    saveToCache(tot_num, "tot_num")
    # for h in hists:
    #     w = np.ones_like(hists[h]) / len(hists[h])
    #     ut.plotHist((h, hists[h]), bin_num=100, toFile=True, weights=w, show=False)
    #     plt.figure()
    # for h in hists:
    #     ut.plotToOneHist((h, hists[h]), bin_num=100)
    # plt.close()
    # write_log(tot_num)
    print("Total num: ", tot_num)
    return hists, tot_num
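# Typical call (a sketch; `file` must point at the source dict first, and the
# results are also persisted via saveToCache for later runs):
#
#     hists, tot_num = runAllHists()
#     for name, samples in hists.items():
#         print(name, len(samples))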
import json

from mysql.connector import connection

with open('./preprocessing/config.json') as config_file:
    data = json.load(config_file)

# Connect using the credentials from the config file.
cnx = connection.MySQLConnection(user=data['mysql']['user'],
                                 password=data['mysql']['passwd'],
                                 host=data['mysql']['host'],
                                 database=data['mysql']['db'])
cursor = cnx.cursor()
query = ("SELECT id, description FROM cs_entry_comment "
         "WHERE entry_id IS NOT NULL AND status IN (1,2)")
cursor.execute(query)

dictionary = loadDict()

# Map id -> normalization value parsed from normalize_like.txt.
review_views = {}
with open('preprocessing/normalize_like.txt', 'r') as f:
    for line in f:
        tmp = line.split(",")
        review_views[tmp[0]] = tmp[1]

convertedData = open('converted_data.txt', 'w')
for (id, description) in cursor:
    print(id)
    num_words = 0
    # Bag-of-words vector over the dictionary, initialized to zero counts.
    doc = {}
    for w in dictionary:
        doc[w] = 0
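# The lookups above imply config.json has a shape like the following
# (a sketch; only the keys actually read are known, values are placeholders):
#
#     {
#       "mysql": {"user": "...", "passwd": "...", "host": "...", "db": "..."}
#     }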