# Build the index matrix for the stored "audio_stem_join" texts and save it
# as "inputs/audio_stem".
import dataop as dop

texts = dop.load_var("audio_stem_join")
# Only element 0 of the stored dictionary object is used.
# Author's note on this value: 34910 (meaning not evident here --
# presumably an entry count; TODO confirm).
lookup = dop.load_var("dicts/audio_stem_all_dict")[0]
index_matrix = dop.create_indexMatrix(texts, lookup, 5000)
dop.save_var(index_matrix, "inputs/audio_stem")

# Disabled variant that targeted the non-stemmed "audio_join" data:
#a = dop.load_var("audio_join")
#d = dop.load_var("dicts/audio_alternatives_dict")
#d = d[0]
#inp = dop.create_indexMatrix(a,d,5)
#dop.save_var(inp,"inputs/audio")
# Train a Sentence_CNN model on the stored audio-stem train/test splits.
from gtrain import gtrain
from data_model_CNN import Sentence_CNN
from data_model_CNN import DataForCNN
import dataop as dop

# Train / test samples and their labels, as stored under "inputs/".
tr = dop.load_var("inputs/audio_stem_tr")
tr_l = dop.load_var("inputs/audio_stem_tr_l")
tst = dop.load_var("inputs/audio_stem_tst")
tst_l = dop.load_var("inputs/audio_stem_tst_l")

data = DataForCNN(tr, tr_l, tst, tst_l)
model = Sentence_CNN(10, 5000)

# Raw string: "\C" in a plain literal is an invalid escape sequence
# (deprecated, SyntaxWarning on recent Python versions). The runtime value
# is unchanged: the nine characters runs\CNN.
gtrain(
    model,
    data,
    out_dir=r"runs\CNN",
    evaluate_every=1000,
    checkpoint_every=1000,
    num_epochs=10000,
)
# Join each slide's audio alternatives into a single digit-free string and
# store the resulting list as "audio_join".
import dataop as dop
import re

# Raw string: "\d" in a plain literal is an invalid escape sequence
# (deprecated, SyntaxWarning on recent Python versions). Compiled once
# because it is applied to every slide.
_DIGIT = re.compile(r"\d")

audio = dop.load_var("audio_alternatives")

# Every alternative is prefixed with one space (so each joined text starts
# with a space), then all decimal digits are stripped.
new = [
    _DIGIT.sub("", "".join(" " + alt for alt in slide))
    for slide in audio
]

dop.save_var(new, "audio_join")
# Flatten the per-slide alternatives of two stored datasets into cleaned
# strings:
#   "audio_alternatives" -> "audio_join"
#   "audio_stem_all"     -> "audio_stem_join"
import dataop as dop


def _join_and_clean(source_name, target_name):
    # Load `source_name`, glue each slide's alternatives into one string
    # (each alternative prefixed by a single space), pass every string
    # through dop.edit_text, and save the list under `target_name`.
    slides = dop.load_var(source_name)
    joined = [
        dop.edit_text("".join(" " + alternative for alternative in slide))
        for slide in slides
    ]
    # edit_text runs once per joined string (above) and once more on the
    # complete list before saving.
    joined = dop.edit_text(joined)
    dop.save_var(joined, target_name)


_join_and_clean("audio_alternatives", "audio_join")
_join_and_clean("audio_stem_all", "audio_stem_join")
import dataop as dop import numpy as np d = dop.load_var("dicts/audio_stem_all_dict") a = np.array(list(d[2].values())) occ2num = dict() for u in set(a): occ2num[str(u)] = int(np.sum(a >= u)) dop.save_var(occ2num, "occ2num_audio_stem") I = dop.tfidf( dop.create_freq_input(dop.load_var("audio_stem_all"), d[0], size=occ2num[str(15)])) I, _ = dop.normalize_lin(I, []) I = I[:, np.sum(I != 0, 0) > 0] dop.save_np(I, "inputs/audio_stem_tfidf_norm") #---ocr--- d = dop.load_var("dicts/ocr_3_dict") a = np.array(list(d[2].values())) occ2num = dict() for u in set(a): occ2num[str(u)] = int(np.sum(a >= u)) dop.save_var(occ2num, "occ2num_ocr") I = dop.tfidf( dop.create_freq_input(dop.load_var("audio_stem_all"), d[0],
# Shuffle the stored audio-stem inputs together with one-hot encoded labels
# and save a test/train split plus the permutation that was used.
import dataop as dop
import numpy as np
from math import floor

samples = np.array(dop.load_var("inputs/audio_stem"))
labels = np.array(dop.load_var("label"))

# The largest label value determines the one-hot width.
num_classes = max(labels)
# The first tenth (rounded down) of the shuffled data becomes the test split.
test_count = floor(len(labels) / 10)

# One-hot encode: label k sets column k - 1 of its row.
one_hot = np.zeros([len(labels), num_classes])
for row, label in zip(one_hot, labels):
    row[label - 1] = 1

# Apply one random permutation to samples and labels alike.
order = np.random.permutation(len(samples))
samples = samples[order]
one_hot = one_hot[order]

# The permutation is stored as well, under "rp".
dop.save_var(order.tolist(), "rp")
dop.save_var(samples[:test_count].tolist(), "inputs/audio_stem_tst")
dop.save_var(one_hot[:test_count].tolist(), "inputs/audio_stem_tst_l")
dop.save_var(samples[test_count:].tolist(), "inputs/audio_stem_tr")
dop.save_var(one_hot[test_count:].tolist(), "inputs/audio_stem_tr_l")
# Build index-matrix inputs for three stored text sources and save each one
# under "inputs/".
import dataop as dop


def _store_index_matrix(text_name, dict_name, out_name, *args, **kwargs):
    # Load `text_name`, pass it through dop.edit_text, index it with element 0
    # of the object stored as `dict_name`, and save the result as `out_name`.
    # Extra positional/keyword arguments go straight to create_indexMatrix.
    texts = dop.edit_text(dop.load_var(text_name))
    lookup = dop.load_var(dict_name)[0]
    matrix = dop.create_indexMatrix(texts, lookup, *args, **kwargs)
    dop.save_var(matrix, out_name)


#--audio_stem--
_store_index_matrix(
    "audio_stem_all", "dicts/audio_stem_all_dict", "inputs/audio_stem", 50000
)

#--ocr--
_store_index_matrix(
    "ocr", "dicts/ocr_1_dict", "inputs/ocr_char", char_level=True
)

#--audio_all--
_store_index_matrix("audio_join", "dicts/audio_join_dict", "inputs/audio_all")