def norm_dist(d_name):
    global cur
    cur = sh.worksheet(d_name)
    # how many VDCs?
    no_vdc = len(cur.col_values(3)) - 2
    print 'len is: ' + str(no_vdc)
    print 'dist: ' + d_name
    # get values
    vals = cur.get_all_values()
    for col in cols_reg:
        col[1] = cur.col_values(vals[1].index(col[0]) + 1)[2:]
        print col[1]
    # normalize values in each column
    for col in cols_reg:
        col[2] = normalize.norm(col[1])
    # get the output cells and write the normalized values back
    for col in cols_reg:
        index = vals[1].index(col[0] + "_normalized")
        out_vals = cur.range(cols[index] + "3:" + cols[index] + str(no_vdc + 1))
        it = 0
        for cell in out_vals:
            if it < len(col[2]):
                cell.value = col[2][it]
            it += 1
        print out_vals
        cur.update_cells(out_vals)
def LinearRegressionModel(dataPath, label, normalize, character, master, ispca):
    pca_n = 2
    sc = SparkContext(master)
    data = sc.textFile(dataPath)
    # parse each text line into a list of floats
    ndata = data.map(lambda line: line.split(character)) \
                .map(lambda part: (map(lambda x: float(x), part[0: len(part)])))
    if label == 0:
        # label is in the first column: reverse each row so it ends up last
        ndata = ndata.map(lambda line: line[::-1])
    if normalize == 1:
        test_data = norm(ndata.collect())
        norm_data = sc.parallelize(test_data)
        train_data = norm_data.map(lambda part: lbp(part[0], part[1]))
        #raw_data = data.map(lambda line: line.split(character))
    else:
        test_data = ndata.map(lambda part: (part[len(part) - 1], part[0:len(part) - 1])).collect()
        train_data = ndata.map(lambda part: lbp(part[len(part) - 1], part[0: len(part) - 1]))
    if ispca == 1:
        pca = PCA(n_components=pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)
        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])
        train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
        test_data = test
    model_lr = lr.train(train_data)
    err_lr = 0.0
    size = len(train_data.collect())
    for i in range(size):
        err_lr = err_lr + abs(model_lr.predict(test_data[i][1]) - test_data[i][0])
    print "result:", err_lr / size
    String = "Linear Regression Result:\n"
    String = String + str(model_lr.weights) + '\n'
    String = String + "Error: " + str(err_lr / size)
    sc.stop()
    return String
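For context, a hypothetical invocation of the function above might look like the following. The dataset path, delimiter, and flag values are illustrative assumptions, not taken from the original project, and the call presumes the module's own imports (SparkContext, norm, and aliases such as lbp and lr) are already in place.

# Illustrative usage only (hypothetical path and flag values): trains on a
# comma-separated file whose last column holds the target value.
result = LinearRegressionModel(
    dataPath='data/input.csv',  # hypothetical path
    label=1,                    # label is the last column (0 would reverse each row)
    normalize=1,                # run the rows through the norm() helper first
    character=',',              # field delimiter
    master='local',             # Spark master URL
    ispca=0)                    # skip the 2-component PCA projection
print result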
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
from normalize import norm

data = np.genfromtxt('./sylvester/out.csv', delimiter=',')
data = norm(data)
pm25 = list(data[:, 0])
pm10 = list(data[:, 1])

plt.scatter(pm25, pm10, c=np.arange(len(pm10)), cmap='gnuplot', s=0.1)
cbar = plt.colorbar()
ticks = ['16:00', '18:00', '20:00', '22:00', '24:00', '02:00']
cbar.set_ticks(range(len(pm10))[::60 * 60 * 2])
#cbar.set_label('time in seconds')
cbar.ax.set_yticklabels(ticks)
plt.xlabel('pm2.5 in µg/m³')
plt.ylabel('pm10 in µg/m³')
plt.show()
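The norm imported here comes from a project-local normalize module that is not shown in this listing. Purely as an assumption about what a plotting script like this needs, a minimal column-wise min-max scaler could look like the sketch below; the real module may behave differently (the Spark and TFRecord examples on this page clearly use their own variants).

import numpy as np

def norm(data):
    # Hypothetical stand-in, NOT the project's implementation:
    # scale every column of a 2-D array into the range [0, 1].
    data = np.asarray(data, dtype=np.float64)
    col_min = data.min(axis=0)
    col_range = data.max(axis=0) - col_min
    col_range[col_range == 0] = 1.0  # keep constant columns from dividing by zero
    return (data - col_min) / col_range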
def deal_file(file):
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num, file=sys.stderr)
            l = line.rstrip('\n').split('\t')
            img = l[0]
            texts = l[FLAGS.text_index].split('\x01')
            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            #image_feature = [float(x) for x in l[FLAGS.image_feature_index].strip().split(' ')]
            #image_feature = [0.] * IMAGE_FEATURE_LEN
            assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (
                img, len(image_feature))
            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue
                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img, text, word_ids, text2ids.ids2text(word_ids),
                          len(image_feature), file=sys.stderr)
                if len(word_ids) == 0:
                    print('empy wordids!', file=sys.stderr)
                    print(img, text, word_ids, text2ids.ids2text(word_ids),
                          len(image_feature), file=sys.stderr)
                    continue
                #if is_luanma(words, word_ids):
                #    print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
                #    continue
                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_feature': melt.float_feature(image_feature),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name': melt.bytes_feature(img),
                            'image_feature': melt.float_feature(image_feature),
                            'text_str': melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)
                #global counter, max_num_words, sum_words
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length
                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)
                    #Depreciated not use image_labels
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)
                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1
                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()
                        image_names.append(img)
                        if FLAGS.small_feature:
                            image_features.append(image_feature)
                        else:
                            #actually save pic path instead of image feature
                            image_features.append(
                                os.path.join(FLAGS.big_feature_image_dir,
                                             img.replace('/', '_')))
                    if FLAGS.num_max_records > 0:
                        #if fixed valid only get one click for each image
                        break
            num += 1
            if num == FLAGS.num_max_records:
                break
def deal_imgtextfile(file):
    """
    Since image text / encoded images are both big (say ~18G for 2w pics, while
    image features of shape (23820, 2048) are only ~373M), this is not used much;
    only if you do not want to do metric evaluation (recall@1, ... for images) and
    you do not want to convert and store image binaries from imatext (preprocess).
    """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    assert len(pic_info_map) > 0
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num, file=sys.stderr)
            l = line.rstrip('\n').split('\t')
            img = l[0]
            if img not in pic_info_map:
                continue
            img_text = l[-1]
            encoded_image = urllib.unquote_plus(img_text)
            text_info = pic_info_map[img]
            texts = text_info.split('\x01')
            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue
                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img, text, word_ids, text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empy wordids!', file=sys.stderr)
                    print(img, text, word_ids, text2ids.ids2text(word_ids),
                          file=sys.stderr)
                    continue
                #if is_luanma(words, word_ids):
                #    print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
                #    continue
                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_data': melt.bytes_feature(encoded_image),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name': melt.bytes_feature(img),
                            'image_data': melt.bytes_feature(encoded_image),
                            'text_str': melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)
                #global counter, max_num_words, sum_words
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length
                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)
                    #Depreciated not use image_labels
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)
                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1
                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()
                        image_names.append(img)
                        ##--well too big for encoded_image and so not consider evaluation? TODO
                        #image_features.append(encoded_image)
                        if FLAGS.image_dir:
                            #actually save pic path instead of image feature
                            image_features.append(
                                os.path.join(FLAGS.image_dir, img.replace('/', '_')))
                    if FLAGS.num_max_records > 0:
                        #if fixed valid only get one click for each image
                        break
            num += 1
            if num == FLAGS.num_max_records:
                break
print('min_count:', FLAGS.min_count, 'most_common:', FLAGS.most_common)

num = 0
for line in sys.stdin:
    if num % 10000 == 0:
        print(num, file=sys.stderr)
    l = line.rstrip().split('\t')
    try:
        texts = l[1].split('\x01')
    except Exception:
        print(line, file=sys.stderr)
    #texts = l[2].split('\x01')
    for text in texts:
        text = normalize.norm(text)
        words = segmentor.Segment(text, FLAGS.seg_method)
        if num % 10000 == 0:
            print(text, '|'.join(words), len(words), file=sys.stderr)
        counter.add(START_WORD)
        for word in words:
            counter.add(word)
            if word.isdigit():
                counter.add('<NUM>')
        counter.add(END_WORD)
    num += 1

counter.add(START_WORD)
print(FLAGS.out_dir, file=sys.stderr)
if not FLAGS.vocab_name:
from normalize import Normalize as norm
from arOp_Grey import ArOpGrey as ar
from geometricsOperations import GeometricsOperations as geo
from arOp_Color import ArOpColor as arC

e = geo()
n = norm()
ar = ar()
arC = arC()

# Color images need to be supplied here
#n.geometricColorNormalize("temp_img/raster_grey.png", "temp_img/raster_grey_2.png")
#n.geometricGreyNormalize("temp_img/raster_grey.png", "temp_img/raster_grey_2.png")

#ar.sumImageWithNumber("temp_img/zdj1.jpg", 40)
#ar.sumImageWithImage("temp_img/2/1.jpg", "temp_img/2/2.jpg")
#ar.multiplyImgWithNumber("temp_img/zdj1.jpg", 100)
#ar.multiplyImgWithImg("temp_img/2/1.jpg", "temp_img/2/2.jpg")
#ar.mixImagesWithRate("temp_img/2/1.jpg", "temp_img/2/2.jpg", 0.4)
#ar.escalateImg("temp_img/2/1.jpg", 2)
#ar.divideImgByNumber("temp_img/2/2.jpg", 50)
#ar.divideImgByImg("temp_img/2/1.jpg", "temp_img/2/2.jpg")
#ar.extractImg("temp_img/2/1.jpg", 3)
#ar.logImg("temp_img/2/1.jpg")

#arC.sumImgWithNumber("temp_img/cukierki.tiff", 20)
#arC.sumImgWithImg("temp_img/2/1.jpg", "temp_img/2/2.jpg")
#arC.multiplyImgWithNumber("temp_img/2/1.jpg", 150)
#arC.multiplyImgWithImg("temp_img/2/1.jpg", "temp_img/2/2.jpg")
#arC.mixImagesWithRate("temp_img/2/1.jpg", "temp_img/2/2.jpg", 0.4)
#arC.escalateImg("temp_img/2/1.jpg", 2)
#arC.divideImgByNumber("temp_img/2/1.jpg", 300)
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from normalize import norm


def func(x, a, b, c):
    return a * np.exp(-b * x) + 2.5


data = np.genfromtxt('./messung1/out.csv', delimiter=',')
ydata25 = np.array(norm(data[2500:4000])[:, 0], np.float32)
ydata10 = np.array(norm(data[2500:4000])[:, 1], np.float32)
xdata = np.array(np.arange(ydata25.shape[0]), np.float32)

plt.figure(1)
plt.subplot(211)
popt25, pcov25 = curve_fit(func, xdata, ydata25)
popt10, pcov10 = curve_fit(func, xdata, ydata10)
plt.plot(ydata25, label='pm2.5')
plt.plot(func(xdata, *popt25), label='pm2.5-regression')
plt.plot(ydata10, label='pm10')
plt.plot(func(xdata, *popt10), label='pm10-regression')
plt.ylabel('linear-scale')
plt.xticks([])
plt.legend()
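If one wanted to inspect the fitted exponential-decay parameters after running the script above, printing the optimizer output is enough; the variable names below come straight from the snippet, and only the print lines are added here as an illustration.

# popt25 / popt10 hold the fitted (a, b, c) for each series,
# pcov25 / pcov10 the covariance estimates returned by curve_fit.
print('pm2.5 fit: a=%.3f, b=%.5f' % (popt25[0], popt25[1]))
print('pm10  fit: a=%.3f, b=%.5f' % (popt10[0], popt10[1]))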
     open('raw_data/people_mapping.txt', 'r') as fspk, \
     open('train_' + data_name + '.txt', 'w') as ftrain, \
     open('train_' + data_name + '.info', 'wb') as ftrain_info, \
     open('train_' + data_name + '.sp5', 'wb') as ftrain_kb, \
     open('dev_' + data_name + '.txt', 'w') as fvalid, \
     open('dev_' + data_name + '.info', 'wb') as fvalid_info, \
     open('dev_' + data_name + '.sp5', 'wb') as fvalid_kb, \
     open('test_' + data_name + '.txt', 'w') as ftest, \
     open('test_' + data_name + '.info', 'wb') as ftest_info, \
     open('test_' + data_name + '.sp5', 'wb') as ftest_kb:

    nodes, edges = read_kb.read_in_graph('.')

    if not os.path.exists('temp_norm.dict'):
        spk_map = normalize.get_spk_map(fspk)
        print(spk_map)
        normalized_contexts, alligned_entities, alligned_dykb = \
            normalize.norm(fchat, nodes, edges, spk_map)
        pickle.dump((normalized_contexts, alligned_entities, alligned_dykb),
                    open('temp_norm.dict', 'wb'))
    else:
        (normalized_contexts, alligned_entities, alligned_dykb) = \
            pickle.load(open('temp_norm.dict', 'rb'))

    entities_occurs = split.shuffle(
        nodes, list(sorted(edges.keys())),
        normalized_contexts, alligned_entities, alligned_dykb,
        [ftrain, fvalid, ftest],
        [ftrain_info, fvalid_info, ftest_info],
        [ftrain_kb, fvalid_kb, ftest_kb],
        [0.85, 0.05, 0.1])

    with open('for_kb_cloud.txt', 'w') as fkb:
        num_kb_appears = 0
        kb_counts = {}
        for n in nodes:
            box = []
def SVMModel(dataPath, label, max_label, min_label, character, master, normalize, ispca):
    pca_n = 2
    sc = SparkContext(master)
    data = sc.textFile(dataPath)
    mid_label = (float(max_label) + float(min_label)) / 2.0
    print data.map(lambda line: line.split(character)).collect()
    ndata = data.map(lambda line: line.split(character)) \
                .map(lambda part: (map(lambda x: float(x), part[0: len(part)])))
    if label == 0:
        # label is in the first column: reverse each row so it ends up last
        ndata = ndata.map(lambda line: line[::-1])
    if normalize == 1:
        test_data = norm(ndata.collect())
        norm_data = sc.parallelize(test_data)
        # binarize the label against the midpoint between min_label and max_label
        train_data = norm_data.map(lambda part: lbp(1.0 if float(part[0]) > mid_label else 0.0, part[1]))
        test_data = norm_data.map(lambda part: (1.0 if float(part[0]) > mid_label else 0.0, part[1])).collect()
    else:
        # label is the last element of each row
        train_data = ndata.map(lambda part: lbp(1.0 if float(part[len(part) - 1]) > mid_label else 0.0, part[0: len(part) - 1]))
        test_data = ndata.map(lambda part: (1.0 if float(part[len(part) - 1]) > mid_label else 0.0, part[0:len(part) - 1])).collect()
    if ispca == 1:
        pca = PCA(n_components=pca_n)
        pca_train = [test_data[i][1] for i in range(len(test_data))]
        pca_data = pca.fit(pca_train).transform(pca_train)
        test = []
        for i in range(len(pca_data)):
            test.append([test_data[i][0], pca_data[i]])
        train_data = sc.parallelize(test).map(lambda part: lbp(part[0], part[1]))
        test_data = test
    model_svm = svm.train(train_data)
    acc_svm = 0
    err_lrg = 0.0
    size = len(train_data.collect())
    for i in range(size):
        if model_svm.predict(test_data[i][1]) == test_data[i][0]:
            acc_svm += 1
    String = "SVM Result:\n"
    String = String + str(model_svm.weights) + "\n"
    String = String + str((float(acc_svm) / float(size)) * 100) + "%"
    x = []
    y = []
    showpic = 0
    if len(test_data[0][1]) == 2:
        ispca = 1
    if ispca == 1:
        for i in range(size):
            if test_data[i][0] == 0.0:
                plt.plot(test_data[i][1][0], test_data[i][1][1], 'ro', color='r', markersize=8)
            elif test_data[i][0] == 1.0:
                plt.plot(test_data[i][1][0], test_data[i][1][1], 'ro', color='b', markersize=8)
        test = sc.parallelize(test_data)
        max_axis = test.map(lambda part: part[1][0]).max()
        min_axis = test.map(lambda part: part[1][0]).min()
        plt.plot([min_axis, max_axis],
                 [max_axis * model_svm.weights[0] + model_svm.weights[1],
                  min_axis * model_svm.weights[0] + model_svm.weights[1]],
                 'g-', linewidth=2)
        plt.savefig('result.jpg')
        plt.close('all')
        showpic = 1
    sc.stop()
    return (showpic, String)