def _process_buff(self, buff):
    """Distribute the buffered sentences into length-based buckets.

    The lengths of the sentences in ``buff`` are counted and passed, with
    ``self.n_bkts``, to ``KMeans``; its ``splits`` become the bucket sizes.
    Every length from just above the previous split up to the current split
    is mapped to that bucket in ``self.len2bkts`` (so lengths that never
    occur in the data are mapped too).  Each sentence is then wrapped in an
    ``Example``, converted with ``self.vocabs`` and added to its bucket; the
    ``(bucket index, value returned by the bucket's add)`` pair is appended
    to ``self.id2position``.

    :param buff: collection of sentences; each must support ``len()`` and
        the collection is iterated twice.
    :return: None
    """
    # Histogram of sentence lengths, used to choose the bucket boundaries.
    length_counts = Counter()
    for sentence in buff:
        length_counts[len(sentence)] += 1
    splits = KMeans(self.n_bkts, length_counts).splits

    # Resize each bucket and map the contiguous length range it covers.
    lower = 0
    for bucket_id, upper in enumerate(splits):
        self.buckets[bucket_id].set_size(upper)
        self.len2bkts.update(
            {length: bucket_id for length in range(lower, upper + 1)})
        lower = upper + 1

    # Build an example per sentence and store it in the matching bucket.
    for sentence in buff:
        bucket_id = self.len2bkts[len(sentence)]
        example = Example(sentence, self._config)
        example.convert(self.vocabs)
        position = self.buckets[bucket_id].add(example)
        self.id2position.append((bucket_id, position))
def construct_examples(raw_data):
    """Build ``Example`` objects from raw review records.

    Only the first entry of each record's ``'reviews'`` list is used.  A
    record contributes an example only when all of the following hold:

    * the review's ``'text'`` and ``'rating'`` are not ``None``;
    * the record has non-``None`` ``'gender'`` and ``'birth_year'`` entries;
    * ``bucket_age(birth_year, review date)`` does not return ``None``;
    * the resulting example's ``get_sentence()`` is non-empty.

    The example text is the title followed by the review's text pieces
    joined with ``" STOP START "``; the label is ``int(rating) - 1``; the
    metadata set holds ``GENDER`` and/or ``BIRTH`` when the mapped gender /
    bucketed age are truthy.

    Note: a ``None`` title is replaced by ``""`` in the input record itself.

    :param raw_data: iterable of record dicts as described above.
    :return: list of ``Example`` objects.
    """
    examples = []
    for record in raw_data:
        review = record['reviews'][0]
        if review['text'] is None or review['rating'] is None:
            continue
        if review['title'] is None:
            # Kept as an in-place update so callers see the same record
            # contents as before.
            review['title'] = ""
        text = review['title'] + " " + " STOP START ".join(review['text'])

        if 'gender' not in record or 'birth_year' not in record:
            continue
        if record['gender'] is None or record['birth_year'] is None:
            continue

        gender_flag = map_gender[record['gender']]
        age_flag = bucket_age(record['birth_year'], review['date'])
        if age_flag is None:
            continue

        meta = set()
        if gender_flag:
            meta.add(GENDER)
        if age_flag:
            meta.add(BIRTH)

        example = Example(text, int(review['rating']) - 1, metadata=meta)
        if len(example.get_sentence()) == 0:
            continue
        examples.append(example)
    return examples
def read_data(filename):
    """Read tab-separated examples from ``filename``.

    Each line is stripped and split on tabs; columns 0, 1, 2 and 4 are used
    as topic, age, gender and text (column 3 is ignored).  Lines whose topic
    is the literal string ``"None"`` are skipped.  The metadata set contains
    ``0`` when age is ``"1"`` and ``1`` when gender is ``"f"``.

    :param filename: path of the file to read.
    :return: list of ``Example(text, int(topic), meta)`` objects.
    :raises IndexError: if a line has fewer than five columns.
    :raises ValueError: if a topic other than ``"None"`` is not an integer.
    """
    examples = []
    # The context manager guarantees the file is closed even on error.
    with open(filename) as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            topic, age, gender, text = fields[0], fields[1], fields[2], fields[4]
            if topic == "None":
                continue
            meta = set()
            if age == "1":
                meta.add(0)
            if gender == "f":
                meta.add(1)
            examples.append(Example(text, int(topic), meta))
    return examples
def preprocess_data(model, raw):
    """Turn raw 9-tuples into ``Example`` objects using the model's vocabularies.

    Each raw item unpacks to ``(x_str, y_str, x_orig, y_orig, x_orig_same,
    y_orig_same, src_domain, pos, src_domain_stats)``; the last field is not
    used.  The source domain's index (from
    ``model.domain_stats_vocab.domain_to_index``) is repeated once per item
    yielded by ``out_vocabulary.sentence_to_indices(y_str)``.

    :param model: object exposing ``in_vocabulary``, ``out_vocabulary``,
        ``domain_stats_vocab`` and ``lexicon``.
    :param raw: iterable of raw 9-tuples.
    :return: list of ``Example`` objects, in input order.
    """
    in_vocab = model.in_vocabulary
    out_vocab = model.out_vocabulary
    stats_vocab = model.domain_stats_vocab
    lex = model.lexicon

    def _build(raw_ex):
        (x_str, y_str, x_orig, y_orig, x_orig_same, y_orig_same,
         src_domain, pos, _unused_stats) = raw_ex
        # One domain index per output position.
        domain_indices = [
            stats_vocab.domain_to_index[src_domain]
            for _ in out_vocab.sentence_to_indices(y_str)
        ]
        return Example(x_str, y_str, x_orig, y_orig, x_orig_same,
                       y_orig_same, src_domain, pos, in_vocab, out_vocab,
                       domain_indices, lex,
                       reverse_input=OPTIONS.reverse_input)

    return [_build(raw_ex) for raw_ex in raw]
def Make_feature_file(authorIdPaperIds, dict_coauthor,
                      dict_paperIdAuthorId_to_name_aff, PaperAuthor, Author,
                      feature_function_list, to_file):
    """Compute features for every author/paper pair and write them to disk.

    For each pair, every function in ``feature_function_list`` is called as
    ``f(pair, dict_coauthor, dict_paperIdAuthorId_to_name_aff, PaperAuthor,
    Author)`` and the results are merged with ``util.mergeFeatures``.  The
    example target is the pair's ``label`` (``"-1"`` when it is ``None``) and
    its comment is ``"<paperId> <authorId>"``.

    The examples are written with ``util.write_example_list_to_file`` to
    ``to_file`` and with ``util.write_example_list_to_arff_file`` to
    ``to_file + ".arff"``; the dimension passed to the latter is that of the
    last merged feature (``0`` when there are no pairs).
    """
    collected = []
    last_dimension = 0
    progress = pyprind.ProgPercent(len(authorIdPaperIds))

    for pair in authorIdPaperIds:
        progress.update()

        # Merge the individual feature-function outputs into one feature.
        merged = util.mergeFeatures([
            extract(pair, dict_coauthor, dict_paperIdAuthorId_to_name_aff,
                    PaperAuthor, Author)
            for extract in feature_function_list
        ])
        last_dimension = merged.dimension

        # Target of the example; pairs without a label get "-1".
        label = pair.label
        target = "-1" if label is None else label

        example = Example(target, merged)
        example.comment = "%s %s" % (pair.paperId, pair.authorId)
        collected.append(example)

    util.write_example_list_to_file(collected, to_file)
    # Also export as an ARFF file.
    util.write_example_list_to_arff_file(collected, last_dimension,
                                         to_file + ".arff")
def preprocess_data(model, raw):
    """Turn raw ``(x_str, y_str, z_str)`` triples into ``Example`` objects.

    The domain size handed to every example is ``len(DOMAINS)``.

    :param model: object exposing ``in_vocabulary``, ``out_vocabulary`` and
        ``lexicon``.
    :param raw: iterable of 3-tuples.
    :return: list of ``Example`` objects, in input order.
    """
    in_vocab = model.in_vocabulary
    out_vocab = model.out_vocabulary
    n_domains = len(DOMAINS)
    lex = model.lexicon
    return [
        Example(x_str, y_str, z_str, in_vocab, out_vocab, n_domains, lex,
                reverse_input=OPTIONS.reverse_input)
        for x_str, y_str, z_str in raw
    ]
def read(filename):
    """
    Read the Iris csv file and construct the Example objects.

    Every non-blank line is split on commas.  The last column is the
    training label (kept as a string); all preceding columns are parsed as
    floats and placed after a leading bias term of 1 in the feature vector.
    Blank lines (for example a trailing empty line at the end of the file)
    are skipped instead of producing an example with an empty label.

    :param filename: Name of the csv file containing iris data
    :return: A list of Example objects
    :raises ValueError: if a feature column cannot be parsed as a float
    """
    data = []
    with open(filename, 'r', encoding='utf-8') as csv_file:
        for each_line in csv_file:
            stripped = each_line.strip()
            if not stripped:
                continue
            *raw_features, label = stripped.split(',')
            feature_vector = np.zeros(len(raw_features) + 1, float)
            feature_vector[0] = 1  # bias
            feature_vector[1:] = [float(value) for value in raw_features]
            data.append(Example(label, feature_vector))
    return data
def accept():
    """Handle the method-notation form.

    POST: requires the form fields ``place`` and ``stage``.  When either is
    missing a plain error string is returned; when either is empty a flash
    message is queued and the client is redirected to ``home``.  Otherwise
    the notation is parsed with ``notationReader``, ``methodPlayer`` and
    ``methodDrawer`` are invoked on the result, and ``results.html`` is
    rendered with an ``Example`` pointing at ``audio/<name>.wav`` and
    ``images/<name>.jpg``.

    GET: returns a fixed diagnostic string.

    Any other request method falls through and returns ``None``.
    """
    if request.method == 'POST':
        print("HALP!!")
        if 'place' not in request.form:
            return "No place in the input!"
        if 'stage' not in request.form:
            return "No stage in the input!"

        place = str(request.form['place'])
        stage = str(request.form['stage'])
        if len(place) < 1 or len(stage) < 1:
            # The previous code also read request.form['method'] here without
            # using it; a missing key raises and would abort the request
            # before this message could be shown.
            flash("Please enter proper place and stage", "danger")
            return redirect(url_for("home"))

        print("Building example")
        formattedString = notationReader(place, stage)
        example = Example('audio/' + formattedString + '.wav',
                          'images/' + formattedString + '.jpg')
        methodPlayer(formattedString)
        methodDrawer(formattedString)
        return render_template('results.html', example=example,
                               formattedString=formattedString)

    if request.method == 'GET':
        return "A get request to accept?!?"
def get_batch_generator(self, mode="train", single_pass=False):
    """Yield ``Batch`` instances built from the rows of one data split.

    Rows of ``self.video_data`` whose ``"mode"`` column equals ``mode`` are
    paired as ``(video_path, caption)``.  On every pass the pairs are
    shuffled and cut into consecutive full batches of
    ``self.hparams.batch_size``; a trailing partial batch is dropped.  Each
    video path is loaded with ``np.load`` and wrapped, with its caption, in
    an ``Example`` that uses ``self.hparams.word2id`` as vocabulary.

    mode: one of "train", "eval", "infer".
    single_pass: if True (or if ``self.single_pass`` is true) the generator
        stops after one pass over the data; otherwise it loops forever.

    Raises ValueError when looping forever was requested but the split has
    fewer examples than one batch (the loop could never yield anything).
    """
    assert mode in ["train", "eval", "infer"], "mode can be {train, eval, infer}"
    mode_data = self.video_data[self.video_data["mode"] == mode]
    # Materialise the pairs: len() and random.shuffle() need a real list.
    video_captions = list(zip(mode_data["video_path"].values,
                              mode_data["caption"].values))
    example_num = len(video_captions)
    print("mode = {mode} and the sample num is {sample_num}".format(
        mode=mode, sample_num=example_num))

    batch_size = self.hparams.batch_size
    # Start offsets of every *full* batch, including the last one when
    # example_num is an exact multiple of batch_size.
    batch_starts = range(0, example_num - batch_size + 1, batch_size)
    if not batch_starts and not (self.single_pass or single_pass):
        raise ValueError(
            "not enough examples (%d) for a batch of size %d in mode %r"
            % (example_num, batch_size, mode))

    while True:
        random.shuffle(video_captions)
        for start in batch_starts:
            batch_pairs = video_captions[start:start + batch_size]
            yield Batch([
                Example(np.load(video_path), caption,
                        vocab=self.hparams.word2id)
                for video_path, caption in batch_pairs
            ])
        if self.single_pass or single_pass:
            print("infer mode: no more data")
            break
def read(images, labels):
    """
    Read the digits image and label files and build the example objects
    with the feature vectors.

    One ``Example`` is created per line of the label file.  The image file
    is consumed in groups of ``IMAGE_SIZE`` lines; within each line the
    first ``IMAGE_SIZE`` characters are encoded as 0 for a space and 1 for
    any other character.  Index 0 of every feature vector is a bias of 1.
    After the whole file is read, the most recently built vector is also
    assigned to the last example.

    :param images: Name of the text file containing the images
    :param labels: Name of the text file containing the labels
    :return: A list of Example objects
    """
    with open(labels, 'r', encoding='utf-8') as label_file:
        data = [Example(int(label_line.strip())) for label_line in label_file]

    rows_in_image = 0   # rows consumed of the image currently being read
    image_index = 0     # example that receives the next completed image
    with open(images, 'r', encoding='utf-8') as image_file:
        for row in image_file:
            if rows_in_image == 0:
                # First row of a new image: start a fresh vector.
                pixels = np.zeros(NUM_FEATURES, int)
                pixels[0] = 1  # bias
                slot = 1
            for char in row[:IMAGE_SIZE]:
                pixels[slot] = int(char != ' ')
                slot += 1
            rows_in_image = (rows_in_image + 1) % IMAGE_SIZE
            if rows_in_image == 0:
                data[image_index].fvector = pixels
                image_index += 1
    data[-1].fvector = pixels
    return data
def run_eval():
    """Decode every input row and write the best derivation per row.

    Requires ``OPTIONS.load_file`` and ``OPTIONS.input`` to be set.  The
    model is built from the spec of the training data after seeding both
    ``random`` and ``numpy.random`` with ``OPTIONS.model_seed``.  Input is
    read as tab-separated ``(id, input)`` rows from ``OPTIONS.input`` (the
    first row is treated as a header and discarded); for each row the first
    result of ``decode`` is written to ``OPTIONS.output`` as
    ``id, input, output, score``.
    """
    import csv

    assert OPTIONS.load_file is not None
    assert OPTIONS.input is not None

    train_raw = load_dataset(OPTIONS.train_data)
    random.seed(OPTIONS.model_seed)
    numpy.random.seed(OPTIONS.model_seed)
    model = get_model(init_spec(train_raw))

    rows_in = csv.reader(OPTIONS.input, delimiter='\t')
    rows_out = csv.writer(OPTIONS.output, delimiter='\t')

    next(rows_in)  # discard the header row
    rows_out.writerow(['id', 'input', 'output', 'score'])

    for row_id, utterance in rows_in:
        example = Example(utterance.strip(), '', model.in_vocabulary,
                          model.out_vocabulary, model.lexicon,
                          reverse_input=OPTIONS.reverse_input)
        best = decode(model, example)[0]
        decoded = " ".join(best.y_toks).strip()
        rows_out.writerow([row_id, utterance, decoded, best.p])
def _load_examples(self, klass, example_group):
    """Append a wrapper for every example found in ``klass`` to ``example_group``.

    An example is wrapped in ``PendingExample`` when either the example or
    the whole group is reported as pending, and in ``Example`` otherwise.
    Both wrappers receive the example's ``_tags`` and ``self.module``.
    """
    for candidate in self._examples_in(klass):
        tags = candidate._tags
        pending = (self._is_pending_example(candidate)
                   or self._is_pending_example_group(example_group))
        wrapper = PendingExample if pending else Example
        example_group.append(
            wrapper(candidate, tags=tags, module=self.module))
def make_examples(filename):
    """Load a JSON file and build one ``Example`` per entry.

    Each entry's ``'text'`` is passed through ``preprocess`` and used as the
    input; its ``'ans_simple'`` is used as the denotation.

    :param filename: path to a JSON file holding a sequence of entries.
    :return: list of ``Example`` objects, in file order.
    """
    with open(filename) as json_file:
        entries = json.load(json_file)
    # TODO: support multiple answers
    return [
        Example(input=preprocess(entry['text']),
                denotation=entry['ans_simple'])
        for entry in entries
    ]
def upload_audio():
    """Handle an audio upload and run ``deepdream_func`` on it.

    POST: validates the uploaded ``file`` part.  A missing part, an empty
    filename or a file rejected by ``allowed_file`` each queue a flash
    message and redirect to ``home``.  An accepted file is saved under
    ``app.config['UPLOAD_FOLDER']`` and ``deepdream_func`` is called with
    the form fields ``layer``, ``channel``, ``iterations`` and ``octaves``
    (the last three converted with ``int``).  When it returns ``-1`` a flash
    message is queued and the client is redirected to ``home``; otherwise
    ``results.html`` is rendered with an ``Example`` built from the returned
    filenames.

    GET: sends ``the_books.mp3`` from the ``uploads`` directory.

    Any other request method returns ``None``.
    """
    if request.method == 'POST':
        # check if the post request has the file part
        if 'file' not in request.files:
            flash('No file part', 'danger')
            return redirect(url_for('home'))
        upload = request.files['file']
        # if the user does not select a file, the browser also
        # submits an empty part without a filename
        if upload.filename == '':
            flash('No selected file', 'danger')
            return redirect(url_for('home'))
        if not (upload and allowed_file(upload.filename)):
            # Previously this case fell through and returned None.
            flash('File type not allowed', 'danger')
            return redirect(url_for('home'))

        filename = secure_filename(upload.filename)
        filelocation = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        upload.save(filelocation)
        print(filelocation)

        # get variables from the user's form
        layer = request.form['layer']
        channel = int(request.form['channel'])
        iterations = int(request.form['iterations'])
        octaves = int(request.form['octaves'])
        path_to_audio = "./audio/" + str(filename)
        audio_name = filename.split('.')[0]
        print("Audio name is: ")
        print(audio_name)
        print("The forms data:")
        print(layer, channel, path_to_audio, iterations, octaves)

        # run the function
        return_object = deepdream_func(layer, channel, path_to_audio,
                                       iterations, octaves, audio_name)
        if return_object == -1:
            # return error message
            print("Showing error flash?!?")
            flash('Please select a channel that is in range for this layer',
                  'danger')
            return redirect(url_for('home'))

        print(return_object)
        # return image
        ex = Example(
            os.path.join('/audio', return_object['audio_filename']),
            "/images/in.jpg",
            return_object['audio_filename_new'],
            "/images/out.jpg")
        return render_template('results.html', example=ex)
    elif request.method == 'GET':
        return send_from_directory("uploads", "the_books.mp3")
    return
def examples(self):
    """Return one ``Example`` per example text of every parsed intent.

    ``self.parse()`` supplies ``(entities, intents)``; each example is built
    from the text, the intent name and the full entities value.
    """
    entities, intents = self.parse()
    return [
        Example(text, intent_name, entities)
        for intent_name, intent in intents.items()
        for text in intent['examples']
    ]
def add_examples(self, list_of_examples):
    """
    Add every provided example with acceptable coordinates to BasicGrid.

    The last item of each example is excluded from the coordinate check;
    examples that fail the check are silently skipped.

    :param list_of_examples: A list of Examples, each given as a list.
    """
    for raw_example in list_of_examples:
        coordinates = raw_example[:-1]
        if not self.check_if_proper_example_coordinates(
                coordinates=coordinates):
            continue
        self.basic_grid.add_example_to_grid(example=Example(raw_example))
def update(self, list_of_examples):
    """
    Queue every example with acceptable coordinates, then run a batch update.

    The last item of each example is excluded from the coordinate check;
    examples that fail the check are silently skipped.

    :param list_of_examples: A list of new Examples, each given as a list.
    """
    for raw_example in list_of_examples:
        coordinates = raw_example[:-1]
        if not self.check_if_proper_example_coordinates(
                coordinates=coordinates):
            continue
        self.example_queue.append(Example(observation=raw_example))
    self.batch_update()
def test(self, example):
    """
    Build an Example from a raw observation and classify it.

    :param example: A list of coordinates and a class id at the last index.
    :return: Result of ``self.classify`` for the example's coordinates, or
        None (after printing a message) when the coordinates are not numeric.
    """
    coordinates = example[:-1]
    if not is_array_numeric(array=coordinates):
        print("Observation coordinates have to be numeric")
        return None
    observation = Example(observation=example)
    return self.classify(example_coords=observation.coords)
def addTabEdit(self, path):
    """Open a new editor tab, optionally pre-filled from a file.

    A fresh ``Example`` widget gets a ``MyHighlighter`` attached to its
    document (kept in ``self.listofHighlighters``), is registered in
    ``self.dictOfTabsEdits`` under ``path`` and added as a tab titled
    ``getFileName(path)``.  Its cursor-position signal is connected to
    ``self.parent.setValuesOfFormat``.

    :param path: file whose text is loaded into the editor, or ``None`` for
        an empty tab.  A file that cannot be read leaves the editor empty.
    """
    e = Example()
    highLighter = MyHighlighter(self.parent.symbolWidget,
                                parent=e.edit.document())
    self.listofHighlighters.append(highLighter)

    if path is not None:
        try:
            with open(path, 'r') as f:
                text = f.read()
        except (IOError, OSError, UnicodeError):
            # Best effort: an unreadable file still opens an empty tab.
            pass
        else:
            e.edit.setText(text)

    self.dictOfTabsEdits.addPath(path, e)
    self.addTab(e, getFileName(path))
    e.edit.cursorPositionChanged.connect(self.parent.setValuesOfFormat)
def reject(self, test_example):
    """Return the product of the example's features with ``self.u`` transposed.

    :param test_example: either an ``Example`` (its ``features_u`` attribute
        is used) or an object that itself provides ``dot``.
    :return: ``features.dot(self.u.T)``.
    """
    # isinstance avoids constructing a throwaway Example just to compare
    # types, and also accepts Example subclasses.
    if isinstance(test_example, Example):
        features = test_example.features_u
    else:
        features = test_example
    return features.dot(self.u.T)
def run_shell(model):
    """Run an interactive loop that decodes utterances typed by the user.

    Each entered line is stripped, wrapped in an ``Example`` and passed to
    ``decode``; up to ten ``(probability, tokens)`` results are printed.
    The loop ends cleanly at end of input (e.g. Ctrl-D or exhausted piped
    input) instead of terminating with an ``EOFError`` traceback.

    :param model: model exposing ``in_vocabulary`` and ``out_vocabulary``.
    """
    print('==== Neural Network Semantic Parsing REPL ====')
    print('')
    print('Enter an utterance:')
    while True:
        try:
            s = raw_input('> ').strip()
        except EOFError:
            print('')
            return
        example = Example(s, '', '', {}, model.in_vocabulary,
                          model.out_vocabulary,
                          reverse_input=OPTIONS.reverse_input)
        print('')
        print('Result:')
        preds = decode(model, example)
        for prob, y_toks in preds[:10]:
            y_str = ' '.join(y_toks)
            print(' [p=%f] %s' % (prob, y_str))
        print('')
def classify(self, test_example):
    """Return the product of the example's features with ``self.w`` transposed.

    :param test_example: either an ``Example`` (its ``features_w`` attribute
        is used) or an object that itself provides ``dot``.
    :return: ``features.dot(self.w.T)``.
    """
    # isinstance avoids constructing a throwaway Example just to compare
    # types, and also accepts Example subclasses.
    if isinstance(test_example, Example):
        features = test_example.features_w
    else:
        features = test_example
    return features.dot(self.w.T)
def change(self, insts):
    """Convert instances into ``Example`` objects holding word and label ids.

    Words present in ``self.word_AlphaBet.list`` are looked up in
    ``self.word_AlphaBet.dict``; all other words get
    ``self.hyperpara.unknow_id``.  Labels are looked up in
    ``self.label_AlphaBet.dict``.

    :param insts: iterable of instances exposing ``words`` and ``labels``.
    :return: list of examples carrying the ids of each sentence's words and
        labels.
    """
    converted = []
    for inst in insts:
        example = Example()
        for word in inst.words:
            known = word in self.word_AlphaBet.list
            word_id = (self.word_AlphaBet.dict[word] if known
                       else self.hyperpara.unknow_id)
            example.wordIndexs.append(word_id)
        for label in inst.labels:
            example.labelIndexs.append(self.label_AlphaBet.dict[label])
        converted.append(example)
    return converted
def post_query():
    """Decode the submitted query and render the top predictions as HTML.

    The ``query`` request parameter is wrapped in an ``Example`` and passed
    to ``decode``; up to ten derivations are listed with their probability
    and token string, each followed by the markup from ``make_heatmap``.

    The query and the decoded token strings are HTML-escaped before being
    embedded in the page, because both can contain user-supplied text.
    ``make_heatmap`` still receives the raw strings and is responsible for
    its own output.

    :return: the rendered ``main`` template.
    """
    query = bottle.request.params.get('query')
    print('Received query: "%s"' % query)
    example = Example(query, '', '', {}, model.in_vocabulary,
                      model.out_vocabulary,
                      reverse_input=OPTIONS.reverse_input)
    preds = decode(model, example)

    # '%s' formatting first so a missing (None) query is shown as before.
    shown_query = bottle.html_escape('%s' % query)
    lines = ['<b>Query: "%s"</b>' % shown_query, '<ul>']
    for i, deriv in enumerate(preds[:10]):
        y_str = ' '.join(deriv.y_toks)
        lines.append('<li> %d. [p=%f] %s'
                     % (i, deriv.p, bottle.html_escape(y_str)))
        lines.append(make_heatmap(query, y_str, deriv.attention_list,
                                  deriv.copy_list))
    lines.append('</ul>')

    content = '\n'.join(lines)
    return bottle.template('main', prompt='Enter a new query',
                           content=content)
def get_from(folder):
    """Load labelled examples from ``<folder>/neg_examples`` and ``<folder>/pos_examples``.

    Non-blank lines become ``Example(line, 0)`` for the negative file and
    ``Example(line, 1)`` for the positive file.  Reading the negative file
    stops once more than 1000 examples have been collected; reading the
    positive file stops once the combined total exceeds 2000.  Progress
    messages are written to stderr.

    :param folder: directory containing the two files.
    :return: list of examples, negatives first.
    """
    neg_file = "{}/neg_examples".format(folder)
    pos_file = "{}/pos_examples".format(folder)
    examples = []

    sys.stderr.write(" Loading negative examples...\n")
    # Context managers make sure both files are closed, even on early exit.
    with open(neg_file) as handle:
        for line in handle:
            line = line.strip()
            if line:
                examples.append(Example(line, 0))
            if len(examples) > 1000:
                break
    sys.stderr.write(" Done.\n")

    sys.stderr.write(" Loading positive examples...\n")
    with open(pos_file) as handle:
        for line in handle:
            line = line.strip()
            if line:
                examples.append(Example(line, 1))
            if len(examples) > 2000:
                break
    sys.stderr.write(" Done.\n")
    return examples
def change(self, file_train):
    """Convert extracted sentences and labels into ``Example`` objects.

    ``self.extract_feature(file_train)`` returns four values; only the
    first two (sentences and labels) are used.  Words found in
    ``self.word_AlphaBet.dict`` contribute their id to the example's
    ``m_word_indexes``; other words are dropped.  The label is stored as a
    five-slot list with a 1 at position ``int(label)``.

    :param file_train: value forwarded to ``self.extract_feature``.
    :return: list of ``Example`` objects.
    """
    sentences, labels, _third, _fourth = self.extract_feature(file_train)
    all_examples = []
    for position, words in enumerate(sentences):
        example = Example()
        for word in words:
            if word not in self.word_AlphaBet.dict:
                continue
            example.m_word_indexes.append(self.word_AlphaBet.dict[word])
        one_hot = [0] * 5
        one_hot[int(labels[position])] = 1
        example.m_label_index = one_hot
        all_examples.append(example)
    return all_examples
def preprocess_data(model, raw):
    """Turn raw ``(x_str, y_str)`` pairs into ``Example`` objects.

    :param model: object exposing ``in_vocabulary``, ``out_vocabulary`` and
        ``lexicon``.
    :param raw: iterable of 2-tuples.
    :return: list of ``Example`` objects, in input order.
    """
    in_vocab = model.in_vocabulary
    out_vocab = model.out_vocabulary
    lex = model.lexicon
    return [
        Example(x_str, y_str, in_vocab, out_vocab, lex,
                reverse_input=OPTIONS.reverse_input)
        for x_str, y_str in raw
    ]
def examples(self):
    """Return one ``Example`` per entry of the ``images`` map.

    Three maps are loaded with ``self._readmap``.  For each image id the
    example gets the integer id, the image path with its last four
    characters removed, the integer class label, a training flag (true when
    the split value is 1) and ``self.path`` as data directory.
    """
    labels_by_id = self._readmap("image_class_labels")
    images = self._readmap("images")
    split_by_id = self._readmap("train_test_split")
    return [
        Example(id=int(image_id),
                path=images[image_id][:-4],
                species=int(labels_by_id[image_id]),
                is_training=int(split_by_id[image_id]) == 1,
                datadir=self.path)
        for image_id in images
    ]
def get_dataset(k = 4):
    """Load the news dataset and split it into train, dev and test sets.

    The children of the XML root in ``../datasets/newsspace200.xml`` are
    read in groups of eight, in the order given by ``keys``; each child's
    tag is asserted to match the expected key.  A record is kept when its
    category is one of World, Entertainment, Sports or Business.  Its text
    is the title, plus the description when present, passed through
    ``preprocess``; its label is the category's index.

    The examples are passed through ``ner.tags_NE(..., "ag_corpus", k=k)``
    and shuffled.  The first tenth becomes the test set, the second tenth
    the dev set and the rest the training set.

    :param k: forwarded to ``ner.tags_NE``.
    :return: ``(train, dev, test)`` lists.
    """
    keys = ["source", "url", "title", "image", "category", "description",
            "rank", "pubdate"]
    filename = "../datasets/newsspace200.xml"
    root = ET.parse(filename).getroot()

    categories = ["World", "Entertainment", "Sports", "Business"]
    label_map = {name: index for index, name in enumerate(categories)}

    examples = []
    fields = []
    for position, child in enumerate(root):
        assert(child.tag == keys[position % len(keys)])
        fields.append(child.text)
        if len(fields) != len(keys):
            continue
        # A full record has been collected.
        category = fields[4]
        if category in label_map:
            description = fields[2]
            if fields[5] is not None:
                description += " " + fields[5]
            examples.append(
                Example(preprocess(description), label=label_map[category]))
        fields = []

    examples = ner.tags_NE(examples, "ag_corpus", k=k)
    random.shuffle(examples)
    tenth = len(examples) // 10
    test = examples[:tenth]
    dev = examples[tenth:2 * tenth]
    train = examples[2 * tenth:]
    return train, dev, test
def preprocess_data(model, raw):
    """Turn raw ``(x_str, y_str, sub_domain)`` triples into ``Example`` objects.

    :param model: object exposing ``in_vocabulary``, ``out_vocabulary``,
        ``domain_vocabulary`` and ``lexicon``.
    :param raw: iterable of 3-tuples.
    :return: list of ``Example`` objects, in input order.
    """
    in_vocab = model.in_vocabulary
    out_vocab = model.out_vocabulary
    domain_vocab = model.domain_vocabulary
    lex = model.lexicon
    return [
        Example(x_str, y_str, in_vocab, out_vocab, domain_vocab, lex,
                reverse_input=OPTIONS.reverse_input, sub_domain=sub_domain)
        for x_str, y_str, sub_domain in raw
    ]