def main():
    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(train_set)

    # LOAD TEST SET
    dataset_test = Dataset.DatasetReview()
    dataset_test.load_review_from_csv(test_set)

    fe_tfidf = TfidfFeatureExtractor(size=500)
    fe_w2v = WordEmbeddingFeatureExtractor(infile=w2v_vec_path, binary=False, dimen=500)
    fe_sswe_w2v = WordEmbeddingFeatureExtractor(infile=sswe_w2v, binary=False, dimen=500, sswe=1)
    fe_sswe = SennaFeatureExtractor(infile=sswe_senna_vectors, vocabfile=sswe_senna_vocabs, dimen=500)
    feature_extractors = [fe_tfidf, fe_w2v, fe_sswe_w2v, fe_sswe]

    ev = Evaluator()

    print "\n**** CROSS VALIDATION EVALUATION (CORPUS: DATASET) ****\n"
    model = Classifier(models="nn")
    kfold = KFold(n_splits=10)
    ev.eval_with_cross_validation(model, feature_extractors=feature_extractors,
                                  training_set=dataset_train, num_fold=10, cv=kfold)

    model = Classifier(models="nn")
    ev.create_evaluation_result(model, feature_extractors=feature_extractors,
                                training_set=dataset_train, num_fold=10, cv=kfold)

    print "\n**** TEST SET EVALUATION (CORPUS: DATASET) ****\n"
    model = Classifier(models="nn")
    ev.eval_with_test_set(model, feature_extractors=feature_extractors,
                          training_set=dataset_train, test_set=dataset_test)
def run(verbose=True):
    ds = Dataset(name='2010 Census Tracts',
                 cached=datetime.utcnow().replace(tzinfo=utc),
                 cache_max_age=1000,
                 remote_id_field='GEOID10',
                 name_field='NAMELSAD10',
                 lat_field='INTPTLAT10',
                 lon_field='INTPTLON10',
                 field1_en='Land Area',
                 field1_name='ALAND10',
                 field2_en='Water Area',
                 field2_name='AWATER10')

    tract_mapping = {
        'remote_id': ds.remote_id_field,
        'name': ds.name_field,
        'lat': ds.lat_field,
        'lon': ds.lon_field,
        'field1': ds.field1_name,
        'field2': ds.field2_name,
        'mpoly': 'MULTIPOLYGON',
    }

    tract_shp = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/tl_2010_36_tract10.shp'))

    lm = LayerMapping(MapPolygon, tract_shp, tract_mapping, transform=False, encoding='iso-8859-1')
    lm.save(strict=True, verbose=verbose)

    ds.save()
    MapPolygon.objects.filter(dataset=None).update(dataset=ds)
def api_dataset_index(user, values=None):
    if request.method == 'GET':
        return json.dumps(serializers.user_datasets(user))
    else:
        validation = Dataset.validate(values)
        if validation == True:
            dataset = Dataset.from_values(user=user, values=values)
            return response_success(serializers.dataset(dataset))
        else:
            return response_error(validation)
def dataset_upload(request):
    user = request.user
    if request.method == 'POST':
        if user.is_authenticated():
            file = request.FILES.get('filename', '')
            file_name = file.name
            dest_dir = os.path.join(settings.USR_DATASET_ROOT, user.username)
            if not os.path.exists(dest_dir):
                os.makedirs(dest_dir)
            full_path = os.path.join(dest_dir, file_name)
            rel_path = os.path.join(user.username, file_name)

            destination = open(full_path, "wb+")
            for chunk in file.chunks():
                destination.write(chunk)
            destination.close()

            description = request.POST['description']
            access = request.POST['access']
            tbl_separator = {"tab": '\t', "space": ' ', "comma": ',', "semicolon": ';'}
            sep_str = request.POST['sep']
            sep = tbl_separator[sep_str]
            header = request.POST['header']
            if header == 'yes':
                header = True
            elif header == 'no':
                header = False

            # a simple check
            size = 0
            for line in open(full_path):
                size += 1
                dim = len(line.split(sep))
            if header == True:
                size -= 1  # exclude the header line

            new_dataset = Dataset(owner=user, path=rel_path, name=file_name,
                                  dim=dim, size=size, description=description,
                                  access=access, sep=sep_str, header=header)
            new_dataset.save()
            notice = "Congratulations! Your dataset has been successfully uploaded."
            # return render_to_response('dataset/success.html',
            #                           RequestContext(request, {'dataset': new_dataset, 'notice': notice}))
            return HttpResponseRedirect('/datasets/%s/' % new_dataset.id)
        else:
            notice = "You must be logged in to upload datasets"
            form = UploadDatasetForm()
            return render_to_response('dataset/upload.html',
                                      RequestContext(request, {'form': form, 'notice': notice}))
    else:
        form = UploadDatasetForm()
        return render_to_response('dataset/upload.html', RequestContext(request, {'form': form}))
def test_is_not_empty_if_annotated(self):
    m1 = Molecule(name='test', sum_formula="C1H2O3")
    m1.save()
    s1 = Standard(molecule=m1)
    s1.save()
    d1 = Dataset()
    d1.save()
    fs1 = FragmentationSpectrum(ms1_intensity=42, dataset=d1, standard=s1)
    fs1.save()
    molecule_table, molecules_with_spectra = self.get_table_and_count()
    self.assertEqual(len(molecule_table.rows), 1)
    self.assertEqual(molecules_with_spectra, 1)
def example():
    dataset = Dataset()
    meta, data = dataset.at(0, xy=False)
    symbol = Symbol(meta, data)
    symbols = (
        symbol['count'].apply(np.log) ==
        symbol[['age', 'smoke']].interact(lambda x: x[0] * x[1], name='age_smoke') +
        symbol['age'] + symbol['smoke'] + symbol['drug'] +
        symbol['partners'] + symbol['cesd']
    )
    return symbols
def test_make_FragmentationSpectrum_with_centroids(self):
    d1 = Dataset(name='Dataset1')
    d1.save()
    f1 = FragmentationSpectrum(precursor_mz='123.456', spec_num=0, dataset=d1)
    mzs = [10., 20, 50]
    ints = [1., 1., 1.]
    f1.set_centroid_mzs(mzs)
    f1.set_centroid_ints(ints)
    f1.save()
    np.testing.assert_array_almost_equal(mzs, f1.centroid_mzs)
    np.testing.assert_array_almost_equal(ints, f1.centroid_ints)
def get_dataset(datastore, id):
    ''' Creates a dataset object from the .valid file '''
    try:
        datastore.download(id + '/uploads/.valid')
    except AttributeError:
        return None
    with open(id + '/uploads/.valid', 'r') as validfile:
        if validfile.read() == id:
            dataset = Dataset(id)
            dataset.datastore = datastore
            return dataset
def test_xic_and_standard_and_adduct(self):
    # create some datasets
    d1 = Dataset(name='Dataset1')
    d1.save()
    a1 = Adduct(nM=1, delta_formula='-H', charge=-1)
    a1.save()
    m1 = Molecule(name='TestMolecule1', sum_formula="C1H2O3")
    m1.save()
    s1 = Standard(molecule=m1, inventory_id="0")
    s1.save()
    # create some xics
    x1 = Xic(mz=60.993, dataset=d1)
    xic = [1.0, 2.0, 3.0, 4.0, 5.0]
    x1.set_xic(xic)
    x1.standard = s1
    x1.adduct = a1
    x1.save()
    self.assertEqual(Xic.objects.all().count(), 1)
    self.assertEqual(Dataset.objects.all().count(), 1)
    self.assertEqual(Standard.objects.all().count(), 1)
    # mass check
    with self.assertRaises(ValueError):
        x1.mz = 123.993
        x1.save()
        x1.check_mass()

def test_xic_mass_filter(self):
    d1 = Dataset(name='dataset')
    d1.save()
    mz = 60.993
    # three larger
    Xic(mz=mz + 5., dataset=d1).save()
    Xic(mz=mz + 10., dataset=d1).save()
    Xic(mz=mz + 15., dataset=d1).save()
    # three approx equal
    Xic(mz=mz + 0.005, dataset=d1).save()
    Xic(mz=mz + 0.0, dataset=d1).save()
    Xic(mz=mz - 0.0015, dataset=d1).save()
    # three smaller
    Xic(mz=mz - 5., dataset=d1).save()
    Xic(mz=mz - 10., dataset=d1).save()
    Xic(mz=mz - 15., dataset=d1).save()
    # three approx equal from another dataset
    d2 = Dataset(name='dataset2')
    d2.save()
    Xic(mz=mz + 0.005, dataset=d2).save()
    Xic(mz=mz + 0.0, dataset=d2).save()
    Xic(mz=mz - 0.0015, dataset=d2).save()
    self.assertEqual(Xic.objects.all().count(), 12)
    # select the three approximately-equal xics from d1 (lower bound below mz, upper bound above)
    xics = Xic.objects.filter(dataset=d1).filter(mz__gte=mz - 0.01).filter(mz__lte=mz + 0.01)
    self.assertEqual(xics.count(), 3)
def get_dataset(datastore, id):
    ''' Creates a dataset object from the .valid file '''
    try:
        valid_path = '{0}/uploads/.valid'.format(id)
        valid_file = datastore.read(valid_path)
    except AttributeError:
        return None
    if valid_file.read() == id:
        dataset = Dataset(id)
        dataset.datastore = datastore
        return dataset
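# Hedged usage sketch for get_dataset() above: `make_datastore`, app.config['DATASTORE']
# and the dataset id are assumptions borrowed from the new_dataset() snippet further down;
# nothing here is guaranteed by the original code beyond get_dataset()'s own signature.
def load_dataset_or_fail(datastore, dataset_id):
    # get_dataset() returns None when the .valid marker is missing or does not match the id
    dataset = get_dataset(datastore, dataset_id)
    if dataset is None:
        raise LookupError('No valid dataset found for id {0}'.format(dataset_id))
    return dataset

# Example (hypothetical wiring):
# datastore = make_datastore(app.config['DATASTORE'])
# dataset = load_dataset_or_fail(datastore, 'a3f1c2d4-0000-0000-0000-000000000000')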
def dataset():
    print(request.method)
    form = Dataset(request.form)
    # root = Tk()
    # root.filename = filedialog.asksaveasfilename(initialdir="/", title="Select file",
    #                                              filetypes=(("jpeg files", "*.jpg"), ("all files", "*.*")))
    # name = asksaveasfilename()
    # with open(name + '.csv', 'w', newline='') as csvfile:
    #     create = csv.writer(csvfile)
    #     create.writerow(["adam", "deboosere"])
    if not form.validate_on_submit():
        return render_template('dataset.html', form=form)
    if request.method == 'POST':
        # return 'Submitted!'
        return render_template('dataset.html', form=form)
def new_dataset():
    '''
    Create a unique url for this dataset to work under
    Create a folder on S3 using this url
    '''
    # Make a new dataset object
    id = str(uuid.uuid4())
    dataset = Dataset(id)
    dataset.datastore = make_datastore(app.config['DATASTORE'])

    # Write a verifying file to prove we created these folders
    validname = '{0}/uploads/.valid'.format(dataset.id)
    dataset.datastore.write(validname, StringIO(dataset.id))

    return redirect('/datasets/' + dataset.id)
def add_dataset():
    body = request.get_json()
    name = body.get('name')
    type = body.get('type')
    description = body.get('description')
    provider_id = body.get('provider_id')
    try:
        dataset = Dataset(name, provider_id, type, description)
        dataset.insert()
        return jsonify({'success': True, 'dataset_id': dataset.id})
    except Exception as es:
        print(es)
        abort(422)
def create_dataset(sessionconfig, params):
    session = sessionconfig[0]
    config = sessionconfig[1]
    checkDataset = session.query(Dataset).from_statement(text("SELECT * FROM datasets where name=:name")).\
        params(name=params['name']).first()
    if(checkDataset is None):
        dataset = Dataset(name=params['name'], identifier=params['identifier'],
                          description=params['description'], details=params['details'],
                          module_parameters='', created=params['created'], user=params['user'],
                          fileformat="Parquet", filepath=params['filepath'],
                          schema=params['schema'], module_id='')
        shutil.copyfile(config['METADATA_LOCAL_PATH'], config['BACKUP_METADATA_LOCAL_PATH'])
        session.add(dataset)
        session.commit()
        objs = []
        if(config['BACKEND'] == 'hdfs'):
            objs.append((config['MODULES_DIR'] + 'sqlite.db', config['METADATA_LOCAL_PATH']))
        elif(config['BACKEND'] == 'swift'):
            objs.append(('sqlite.db', config['METADATA_LOCAL_PATH']))
        elif(config['BACKEND'] == 'nfs'):
            pass
        saveObjsBackend(objs, config['BACKEND'], config)
    else:
        raise RuntimeError("The dataset with name " + params['name'] + " already exists")
def create_featureset(sessionconfig, params):
    session = sessionconfig[0]
    config = sessionconfig[1]
    modulename = params['modulename']
    analysisMod = session.query(Analysis).from_statement(text("SELECT * FROM analysis where name=:name")).\
        params(name=modulename).first()
    if(analysisMod):
        # Check if the module exists
        # module_id = analysisMod.id
        checkDataset = session.query(Dataset).from_statement(text("SELECT * FROM datasets where name=:name")).\
            params(name=params['name']).first()
        if(checkDataset is None):
            dataset = Dataset(name=params['name'], identifier='', description=params['description'],
                              details=params['details'], module_parameters=params['module_parameters'],
                              created=params['created'], user=params['user'], fileformat="Parquet",
                              filepath=params['filepath'], schema=params['schema'],
                              module_id=analysisMod.id)
            shutil.copyfile(config['METADATA_LOCAL_PATH'], config['BACKUP_METADATA_LOCAL_PATH'])
            session.add(dataset)
            session.commit()
        else:
            raise RuntimeError('The feature set with the name ' + params['name'] + ' already exists')
    else:
        raise RuntimeError('No Such Module')
def insert_dataset(nex_session, fw, x, parent_dataset_id):
    print "DATASET:", x

    y = Dataset(format_name=x['format_name'],
                display_name=x['display_name'],
                obj_url="/dataset/" + x['format_name'],
                source_id=x['source_id'],
                dbxref_id=x.get('dbxref_id'),
                dbxref_type=x.get('dbxref_type'),
                date_public=x.get('date_public'),
                parent_dataset_id=x.get('parent_dataset_id'),
                assay_id=x.get('assay_id'),
                channel_count=x.get('channel_count'),
                sample_count=x.get('sample_count'),
                is_in_spell=x.get('is_in_spell'),
                is_in_browser=x.get('is_in_browser'),
                description=x.get('description'),
                created_by=CREATED_BY)

    nex_session.add(y)
    nex_session.flush()
    nex_session.refresh(y)

    fw.write("Insert dataset: " + x['display_name'] + " into database\n")

    return y.dataset_id
def fetchDataset(self):
    """Fetch a dataset to the list of cacheable datasets"""

    token = self.dataset_name.split('-')[0]
    try:
        json_info = json.loads(getURL('http://{}/ocpca/{}/info/'.format(settings.SERVER, token)))
    except Exception as e:
        logger.error("Token {} does not exist on the backend {}".format(token, settings.SERVER))
        raise NDTILECACHEError("Token {} does not exist on the backend {}".format(token, settings.SERVER))

    ximagesize, yimagesize, zimagesize = json_info['dataset']['imagesize']['0']
    xoffset, yoffset, zoffset = json_info['dataset']['offset']['0']
    xvoxelres, yvoxelres, zvoxelres = json_info['dataset']['voxelres']['0']
    scalinglevels = json_info['dataset']['scalinglevels']
    scalingoption = ND_scalingtoint[json_info['dataset']['scaling']]
    starttime, endtime = json_info['dataset']['timerange']
    project_name = json_info['project']['name']
    s3backend = json_info['project']['s3backend']

    self.ds = Dataset(dataset_name=self.dataset_name, ximagesize=ximagesize, yimagesize=yimagesize,
                      zimagesize=zimagesize, xoffset=xoffset, yoffset=yoffset, zoffset=zoffset,
                      xvoxelres=xvoxelres, yvoxelres=yvoxelres, zvoxelres=zvoxelres,
                      scalingoption=scalingoption, scalinglevels=scalinglevels,
                      starttime=starttime, endtime=endtime, project_name=project_name,
                      s3backend=s3backend)
    self.ds.save()

    for channel_name in json_info['channels'].keys():
        channel_type = json_info['channels'][channel_name]['channel_type']
        channel_datatype = json_info['channels'][channel_name]['datatype']
        startwindow, endwindow = json_info['channels'][channel_name]['windowrange']
        propagate = json_info['channels'][channel_name]['propagate']
        readonly = json_info['channels'][channel_name]['readonly']
        ch = Channel(channel_name=channel_name, dataset=self.ds, channel_type=channel_type,
                     channel_datatype=channel_datatype, startwindow=startwindow, endwindow=endwindow,
                     propagate=propagate, readonly=readonly)
        ch.save()
def datasetr():
    print(request.method)
    form = Dataset(request.form)
    nameMale = request.form["manname"]
    femalName = request.form["womenname"]
    language = request.form["language"]
    time = request.form["time"]
    min_tweet = request.form["min_number"]
    (tweets, y) = getTweets2(int(time), nameMale, femalName, language, int(min_tweet))
    # tweets = ["Bonjour la famillle ca va !?", "Adaaam ca fait una bail", ...]  # hard-coded sample tweets (debug)
    # form.tweet = 'ada'
    # y = [...]  # hard-coded sample labels (debug)
    y2 = np.array(y)
    f = np.count_nonzero(y2 == 0)
    h = np.count_nonzero(y2 == 1)
    detail = [
        'Nombre de tweets: ' + str(len(y)),
        'Femme: ' + str(f),
        'Homme: ' + str(h)
    ]
    # flash(str(request.form))
    if request.method == 'POST':
        return render_template('datasetresult.html', form=form, tweets=tweets, detail=detail)
def main(train_set, output_path):
    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(train_set)

    # LOAD TEST SET
    # dataset_test = Dataset.DatasetReview()
    # dataset_test.load_review_from_csv(test_set)

    # preprocessor = DatasetPreprocessor()
    # dataset = preprocessor.fold_cases_d(dataset)
    # dataset = preprocessor.remove_punctuations_d(dataset)
    # dataset = preprocessor.convert_numbers_d(dataset)
    # dataset.export_only_contents("../Test/dataset.txt")

    # fe = BagFeatureExtractor(dataset.get_contents())
    # fe.build()
    # fe.save_vocab("../Test/vocab.txt")
    # dataset.export_formatted_dataset("formatted_dataset_wow.tsv")

    if not os.path.exists(output_path):
        os.makedirs(output_path)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(), dimen=300)
    fe.save_model_to_file(output_path + "vectors_full_300.txt",
                          vocabfile=output_path + "vocab_full_300.txt", binary=False)
def test_add_xic(self):
    m1 = Molecule(name='TestMolecule1', sum_formula="C1H2O3")
    m1.save()
    s1 = Standard(molecule=m1, inventory_id="0")
    s1.save()
    a1 = Adduct(nM=1, delta_formula='+H+K', charge=-2)
    a1.save()
    d1 = Dataset(name='Dataset1')
    d1.save()
    d1.standards_present.add(s1)
    d1.adducts_present.add(a1)
    x1 = Xic(mz='0.0', dataset=d1)
    xic = [1.0, 2.0, 3.0, 4.0, 5.0]
    x1.set_xic(xic)
    x1.save()
    self.assertEqual(Xic.objects.all().count(), 1)
    np.testing.assert_array_almost_equal(xic, x1.xic)
def create_dataset(self, user, name, parent=None):
    if parent:
        parent = self.session.query(Dataset).filter(
            Dataset.name == parent).first()
    path = hashlib.sha256(user.name + name).hexdigest()
    dataset = Dataset(name=name, owner=user, path=path, parent=[parent])
    abspath = os.path.join(self.path, path)
    os.makedirs(abspath)
def save_form_data(form, file_name):
    """Save the data associated with an uploaded dataset."""
    data_dict = dict((field, form[field]) for field in form)
    # Now delete the unnecessary keys...
    del data_dict['submit'], data_dict['csrf']
    data_dict['file_name'] = file_name
    dataset = Dataset(**data_dict)
    db.session.add(dataset)
    db.session.commit()
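# A minimal usage sketch for save_form_data(), assuming a WTForms-style mapping that
# still carries the 'submit' and 'csrf' keys deleted above; the remaining field names
# ('name', 'description') are purely illustrative and would have to match the real Dataset model.
example_form = {
    'name': 'iris',
    'description': 'Fisher iris measurements',
    'submit': True,
    'csrf': 'token-placeholder',
}
# save_form_data(example_form, file_name='iris.csv')  # would add and commit one Dataset row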
def setUpTestData(cls):
    d1 = Dataset(name='Dataset1')
    d1.save()
    m1 = Molecule(sum_formula='H2O')
    m1.save()
    m2 = Molecule(sum_formula='O2')
    m2.save()
    s1 = Standard(molecule=m1)
    s1.save()
    s2 = Standard(molecule=m1)
    s2.save()
    s3 = Standard(molecule=m2)
    s3.save()
    FragmentationSpectrum.objects.create(precursor_mz='123.456', spec_num=0, dataset=d1, standard=s1)
    FragmentationSpectrum.objects.create(precursor_mz='123.45', spec_num=0, dataset=d1, standard=s2)
    FragmentationSpectrum.objects.create(precursor_mz='123.4', spec_num=0, dataset=d1, standard=s3)
    cls.m_onespectrum = m2
    cls.m_twospectra = m1
def initvalidateddataset():
    lines = list(open(data_validated_csv).readlines())
    for line in lines:
        audio_path, length, text = line.split(',')
        text = text.replace('\n', '')
        new_data = Dataset()
        new_data.text = text
        new_data.audio_lenght = length  # model field name kept as-is
        new_data.file_path = audio_path
        new_data.file_with_user = 0  # 1 if a user is validating this instance
        new_data.instance_validated = 1  # 1 if a human validated this instance
        new_data.instance_valid = 1  # 1 if the instance is ok
        new_data.user_validated = 'edresson'
        db.session.add(new_data)
        db.session.commit()
def get(self):
    # Check if datasets are loaded in datastore
    # Items in datastore
    d = Dataset.query().count()

    # Items in CDB
    q = "select count(*) as c from resource_staging" + \
        " where ipt is true and networks like '%VertNet%';"
    c = cartodb_query(q)[0]['c']

    # Number of reports stored in the datastore
    num_reports = Report.query().count()

    periods = Period.query()
    num_periods = periods.count()

    periods_done = Period.query(Period.status == "done")
    num_periods_done = periods_done.count()

    periods_progress = Period.query(Period.status == "in progress")
    num_periods_progress = periods_progress.count()

    periods_failed = Period.query(Period.status == "failed")
    num_periods_failed = periods_failed.count()

    resp = {
        "Datastore integrity": [
            {"Datasets in CartoDB": c},
            {"Datasets in the Datastore": d}
        ],
        "Report periods": [
            {"Stored periods": num_periods},
            {"Stored reports": num_reports},
            {"Periods completed": num_periods_done},
            {"Periods in progress": num_periods_progress},
            {"Periods failed": num_periods_failed},
        ]
    }

    if c != d or c == 0:
        dataset_setup_url = "http://%s/setup_datasets" % _HOSTNAME
        resp["Datastore integrity"].append({"URL for dataset setup": dataset_setup_url})

    if num_periods > 0:
        links_to_periods = ["http://%s/status/period/%s" % (_HOSTNAME, x.key.id())
                            for x in periods.fetch()]
        resp["Report periods"].append({"Links to periods": links_to_periods})

    if num_periods_done > 0:
        resp['Report periods'].append({'List of periods done':
                                       [x.period.strftime("%Y-%m") for x in periods_done.fetch()]})
    if num_periods_progress > 0:
        resp['Report periods'].append({'List of periods in progress':
                                       [x.period.strftime("%Y-%m") for x in periods_progress.fetch()]})
    if num_periods_failed > 0:
        resp['Report periods'].append({'List of periods failed':
                                       [x.period.strftime("%Y-%m") for x in periods_failed.fetch()]})

    self.response.headers['content-type'] = "application/json"
    self.response.write(json.dumps(resp))
def test_add_dataset(self):
    # create standards
    m1 = Molecule(name='TestMolecule1', sum_formula="C1H2O3")
    m1.save()
    m2 = Molecule(name='TestMolecule1', sum_formula="C2H2O3")
    m2.save()
    s1 = Standard(molecule=m1, inventory_id="0")
    s1.save()
    s2 = Standard(molecule=m2, inventory_id="1")
    s2.save()
    # create adduct
    a1 = Adduct(nM=1, delta_formula='+H+K', charge=-2)
    a1.save()
    # create a dataset
    d1 = Dataset(name='Dataset1')
    d1.save()
    d1.standards_present.add(s1)
    d1.standards_present.add(s2)
    d1.adducts_present.add(a1)
    self.assertEqual(Dataset.objects.all().count(), 1)
    self.assertEqual(Dataset.objects.all()[0].standards_present.count(), 2)
def savecsv():
    print(request.method)
    form = Dataset(request.form)
    if 'tweets' in request.form:
        tweets = request.form['tweets']
        tweetsR = eval(tweets)
        # return str(tweetsR[0])
        # name = asksaveasfilename()
        if request.form['typef'] == 'json':
            root = Tk()
            root.filename = filedialog.asksaveasfilename(
                initialdir="/",
                title="Destination du Json",
                filetypes=(("JSON", "*.json"), ("all files", "*.*")))
            # root.mainloop()
            root.destroy()
            with open(root.filename + '.json', 'w') as out_f:
                json.dump(tweetsR, out_f)
            out_f.close()
        else:
            root = Tk()
            root.filename = filedialog.asksaveasfilename(
                initialdir="/",
                title="Destination du dataset",
                filetypes=(("xlsx", "*.xlsx"), ("all files", "*.*")))
            # root.mainloop()
            root.destroy()
            workbook = xlsxwriter.Workbook(root.filename + '.xlsx')
            worksheet = workbook.add_worksheet()
            rowEx = 1
            worksheet.write(0, 0, 'text')
            worksheet.write(0, 1, 'gender')
            for tweet in tweetsR:
                worksheet.write(rowEx, 0, tweet[0])
                worksheet.write(rowEx, 1, tweet[1])
                rowEx += 1
            workbook.close()
        # with open(name + '.csv', 'w', newline='') as csvfile:
        #     create = csv.writer(csvfile)
        #     create.writerow(tweets)
    # tweets = ["Bonjour la famillle ca va !?", "Adaaam ca fait una bail", ...]  # hard-coded sample tweets (debug)
    # form.tweet = 'ada'
    # y = [...]  # hard-coded sample labels (debug)
    # flash(str(request.form))
    if request.method == 'POST':
        return render_template('datasetresult.html', form=form, tweets=tweetsR)
def test_xic_mass_filter(self):
    d1 = Dataset(name='dataset')
    d1.save()
    mz = 60.993
    # three larger
    Xic(mz=mz + 5., dataset=d1).save()
    Xic(mz=mz + 10., dataset=d1).save()
    Xic(mz=mz + 15., dataset=d1).save()
    # three approx equal
    Xic(mz=mz + 0.005, dataset=d1).save()
    Xic(mz=mz + 0.0, dataset=d1).save()
    Xic(mz=mz - 0.0015, dataset=d1).save()
    # three smaller
    Xic(mz=mz - 5., dataset=d1).save()
    Xic(mz=mz - 10., dataset=d1).save()
    Xic(mz=mz - 15., dataset=d1).save()
    # three approx equal from another dataset
    d2 = Dataset(name='dataset2')
    d2.save()
    Xic(mz=mz + 0.005, dataset=d2).save()
    Xic(mz=mz + 0.0, dataset=d2).save()
    Xic(mz=mz - 0.0015, dataset=d2).save()
    self.assertEqual(Xic.objects.all().count(), 12)
    # select the three approximately-equal xics from d1 (lower bound below mz, upper bound above)
    xics = Xic.objects.filter(dataset=d1).filter(mz__gte=mz - 0.01).filter(mz__lte=mz + 0.01)
    self.assertEqual(xics.count(), 3)
def main(infile):
    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(infile)

    dataset_train.export_only_contents("sentences_new.txt")

    # fe = BagFeatureExtractor(dataset.get_contents())
    # fe.build()
    # fe.save_vocab("../Test/vocab.txt")

    dataset_train.export_formatted_dataset("formatted_dataset_wow.tsv")
def init_db():
    # import all modules here that might define models so that
    # they will be registered properly on the metadata. Otherwise
    # you will have to import them first before calling init_db()
    from models import Department, User, Role, Dataset
    Base.metadata.drop_all(bind=engine)
    Base.metadata.create_all(bind=engine)

    # Create the fixtures
    engineering = Department(name='Engineering')
    db_session.add(engineering)
    hr = Department(name='Human Resources')
    db_session.add(hr)

    manager = Role(name='manager')
    db_session.add(manager)
    engineer = Role(name='engineer')
    db_session.add(engineer)

    peter = User(name='Peter', department=engineering, role=engineer)
    db_session.add(peter)
    roy = User(name='Roy', department=engineering, role=engineer)
    db_session.add(roy)
    tracy = User(name='Tracy', department=hr, role=manager)
    db_session.add(tracy)

    # Dataset
    import random
    from random import randint
    from faker import Faker
    fake = Faker('en_US')
    nPoints = 11
    # data = {'x': [randint(0, 1000) for i in range(nPoints)],
    #         'z': [float(random.randrange(0, 1000)) / 100 for i in range(nPoints)],
    #         'names': [fake.name() for i in range(nPoints)]}
    data = {
        'x': [int(i) for i in range(nPoints)],
        'z': [float(i) for i in range(nPoints)],
        'names': [fake.name() for i in range(nPoints)]
    }
    test_data1 = Dataset(name='dataset1', description='First dataset', table_name='data1',
                         enabled=True, raw=data)
    db_session.add(test_data1)
    # data = {'x': [randint(0, 1000) for i in range(nPoints)],
    #         'z': [float(random.randrange(0, 1000)) / 100 for i in range(nPoints)],
    #         'names': [fake.name() for i in range(nPoints)]}
    # test_data2 = Dataset(name='dataset2', description='Second dataset', table_name='data2',
    #                      enabled=False, raw=data)
    # db_session.add(test_data2)

    db_session.commit()
def example_1_2_6():
    # Data preprocessing
    meta, data = Dataset().at(4, xy=False)
    mapper = collections.Counter(data['id'])
    for id in mapper:
        index = data['id'] == id
        data.loc[index, 'weight'] = data.loc[index, 'weight'].mean()
    # Symbolic representation of the linear model
    symbol = Symbol(meta, data)
    symbols = (
        symbol['weight'] ==
        symbol['gender'] + symbol['dose'] +
        symbol['id'].apply(lambda xs: [mapper[x] for x in xs], 'size')
    )
    return symbols
def freeze_dataset(self, id_or_name):
    """
    Create a temporary snapshot of a dataset's contents.

    Args:
    ====
        - id_or_name:
            - str().
            - Id or name of the dataset to freeze.

    Returns:
    =======
        - Dataset: if the object can be located and is "freezable".

    Exceptions:
    ==========
        - ValueError:
            - id_or_name is unicode or str but has len == 0.
        - TypeError:
            - id_or_name is not a str or unicode.
    """
    from models import Dataset
    stored_dataset = self.retrieve_dataset_metadata(id_or_name)
    if stored_dataset:
        freezed_dataset = {
            "license_title": stored_dataset['license_title'],
            "maintainer": stored_dataset['maintainer'],
            "private": stored_dataset['private'],
            "maintainer_email": stored_dataset['maintainer_email'],
            "id": stored_dataset['id'],
            "owner_org": stored_dataset['owner_org'],
            "author": stored_dataset['author'],
            "isopen": stored_dataset['isopen'],
            "author_email": stored_dataset['author_email'],
            "state": stored_dataset['state'],
            "license_id": stored_dataset['license_id'],
            "type": stored_dataset['type'],
            "groups": [g['name'] for g in stored_dataset['groups']],
            "creator_user_id": stored_dataset['creator_user_id'],
            "name": stored_dataset['name'],
            "url": stored_dataset['url'],
            "notes": stored_dataset['notes'],
            "title": stored_dataset['title'],
            "license_url": stored_dataset['license_url']
        }
        return Dataset(datadict=freezed_dataset,
                       _distribution_literal=True,
                       _distributions=stored_dataset['resources'])
def diff_datasets(dataset_a=None, dataset_b=None):
    """
    Compare two datasets and return their additive difference.

    When the difference is computed, the value that prevails is dataset_b's.

    Args:
    ====
        - dataset_a:
            - Dataset().
            - Must be of type Dataset().
        - dataset_b:
            - Dataset().
            - Must be of type Dataset().

    Returns:
    =======
        - Dataset().

    Exceptions:
    ==========
        TypeError:
            - One or both arguments are not of class Dataset.
    """
    from models import Dataset
    for v in [dataset_a, dataset_b]:
        if not isinstance(v, Dataset):
            raise TypeError(
                'Both datasets must be of class Dataset in order to compare them.'
            )
    diff_ds = {}
    omit_this_keys = ['required_keys', 'context']
    for k, v in dataset_a.__dict__.items():
        if k not in omit_this_keys:
            if v != dataset_b.__dict__[k]:
                diff_ds.update({
                    k: dataset_b.__dict__[k] if len(dataset_b.__dict__[k]) > 0 else v
                })
            else:
                diff_ds.update({k: v})
    return Dataset(datadict=diff_ds,
                   _distributions=dataset_a.__dict__['resources'],
                   _distribution_literal=True)
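# Hedged usage sketch for diff_datasets(): both arguments must already be Dataset
# instances (a TypeError is raised otherwise). `portal` and its freeze_dataset() calls
# are hypothetical stand-ins for whatever object exposes the freeze_dataset() method above.
# stored = portal.freeze_dataset('my-dataset')        # snapshot of the published dataset
# edited = portal.freeze_dataset('my-dataset-draft')  # snapshot of the edited copy
# merged = diff_datasets(dataset_a=stored, dataset_b=edited)
# For every differing attribute, `merged` keeps dataset_b's value unless it is empty,
# in which case dataset_a's value is preserved.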
def add_testdata_to_db(dataset, items, datatype):
    count = db.session.query(
        Dataset, Dataset.name).filter(Dataset.name == dataset).all()
    if len(count) > 0:
        return 'exists'
    new_dataset = Dataset(name=dataset, datatype=datatype)
    for item in items:
        testitem = Item(
            dataset_name=dataset,
            item=json.dumps(item),
            status='available',
            timestamp=datetime.now(),
        )
        new_dataset.items.append(testitem)
        db.session.add(testitem)
    db.session.add(new_dataset)
    db.session.commit()
    return 'added'
def datasets():
    if not current_user.is_authenticated:
        return redirect(url_for('no_access'))

    datasets = Dataset.query.filter(Dataset.user_id == current_user.id).all()
    for ds in datasets:
        ds.distinctive_name = ds.distinctive_name or ds.filename
        if ds.distinctive_name == ds.filename:
            ds.display_filename = ''
        else:
            ds.display_filename = ds.filename

    model = {
        'title': 'Datasets',
        'datasets': datasets
    }

    form = FileUploadForm()
    if form.validate_on_submit():
        dsFile = form.fileName.data
        separator = form.separator.data
        distinctive_name = form.distinctive_name.data
        filename = secure_filename(dsFile.filename)
        guid = str(uuid.uuid4())
        dsFile.seek(0)
        dt = dsFile.read()
        dbDs = Dataset(filename, guid, g.user, datetime.datetime.utcnow(),
                       separator, distinctive_name, dt)
        db.session.add(dbDs)
        db.session.commit()
        return redirect(url_for('datasets'))

    model['form'] = form
    return render_template('datasets.html', model=model)
def post(self):
    urlfetch.set_default_fetch_deadline(60)
    self.response.headers['Content-Type'] = 'application/json'

    q = "select gbifdatasetid, icode, orgname, github_orgname, " \
        "source_url, github_reponame, url, gbifpublisherid " \
        "from resource_staging " \
        "where ipt=true and networks like '%VertNet%'"
    resources = carto_query(q)

    ds = []
    for resource in resources:
        ds.append(Dataset(id=resource['gbifdatasetid'], **resource))
    keys = ndb.put_multi(ds)

    result = {
        "datasets processed": len(keys),
        "message": "success"
    }
    self.response.write(json.dumps(result))
    return
def dataset_upload(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            post_dict = dict(request.POST)
            files_dict = dict(request.FILES)
            logging.debug(files_dict)
            logging.debug(post_dict)
            data = {"adducts": post_dict['adducts'],
                    "standards": post_dict['standards'],
                    "mass_accuracy_ppm": post_dict['mass_accuracy_ppm'][0],
                    "quad_window_mz": post_dict['quad_window_mz'][0],
                    "lc_info": post_dict['lc_info'][0],
                    "ms_info": post_dict['ms_info'][0]}
            uploaded_file = request.FILES['mzml_file']
            base_name, extension = os.path.splitext(uploaded_file.name)
            d = Dataset(name=uploaded_file.name, processing_finished=False)
            d.save()
            mzml_filename = "{}-{}{}".format(base_name, d.id, extension)
            mzml_filepath = os.path.join(settings.MEDIA_ROOT, mzml_filename)
            logging.debug("mzML filepath: " + mzml_filepath)
            logging.debug("original mzML filename: " + uploaded_file.name)
            with open(mzml_filepath, 'w') as destination:
                for chunk in uploaded_file.chunks():
                    destination.write(chunk)
            d.path = mzml_filepath
            d.save()
            tasks.handle_uploaded_files.delay(data, mzml_filepath, d)
            return redirect('dataset-list')
    else:
        form = UploadFileForm(initial={"mass_accuracy_ppm": 10.0, 'quad_window_mz': 1.0})
    autocomplete = {
        'lc_info': [str(info.content) for info in LcInfo.objects.all()],
        'ms_info': [str(info.content) for info in MsInfo.objects.all()],
    }
    return render(request, 'mcf_standards_browse/dataset_upload.html',
                  {'form': form, 'autocomplete': autocomplete})
def hazardous_waste(year=2011, verbose=True):
    try:
        dataset = Dataset.objects.get(name="Hazardous Waste Sites " + str(year))
        dataset.cached = datetime.utcnow().replace(tzinfo=utc)
    except ObjectDoesNotExist:
        coor = GeoCoordinates(lat_field="Latitude", lon_field="Longitude")
        coor.save()
        names = DatasetNameField(field1_en="Generator Status", field1_name="Generator Status",
                                 field2_en="Biennial Report Link", field2_name="Biennial Report Link")
        names.save()
        location = Location(street_field="Address", city_field="City", state_field="State",
                            zipcode_field="ZIP Code", county_field="County")
        dataset = Dataset(
            name="Hazardous Waste Sites " + str(year),
            url='/data/ej/' + str(year) + '/',
            cached=datetime.utcnow().replace(tzinfo=utc),
            cache_max_age=1000,
            remote_id_field="Handler ID",
            name_field="Handler Name",
            location=location,
            coordinates=coor,
            names=names,
            needs_geocoding=False)
        dataset.save()
    MapPoint.objects.filter(dataset=dataset).delete()
    for state in ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI',
                  'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN',
                  'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH',
                  'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA',
                  'WV', 'WI', 'WY']:
        short_name = 'Envirofacts_Biennial_Report_Search ' + state + '.CSV'
        path = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                            'data/ej/' + str(year) + '/' + short_name))
        if not os.path.isfile(path):
            if verbose:
                print 'No file %s exists.' % (short_name)
            short_name = str(year) + ' ' + state + '.CSV'
            path = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                'data/ej/' + str(year) + '/' + short_name))
            if not os.path.isfile(path):
                if verbose:
                    print 'No file %s exists.' % (short_name)
                continue
        if verbose:
            print 'Opening file %s' % (short_name)
        readfile = csv.reader(open(path, 'rb'))
        # verify
        row = readfile.next()
        locs = {}
        for i in range(len(row)):
            if row[i] == dataset.remote_id_field:
                locs['remote_id'] = i
            elif row[i] == dataset.name_field:
                locs['name'] = i
            elif row[i] == dataset.location.street_field:
                locs['street'] = i
            elif row[i] == dataset.location.city_field:
                locs['city'] = i
            elif row[i] == dataset.location.state_field:
                locs['state'] = i
            elif row[i] == dataset.location.zipcode_field:
                locs['zipcode'] = i
            elif row[i] == dataset.location.county_field:
                locs['county'] = i
            elif row[i] == dataset.coordinates.lat_field:
                locs['lat'] = i
            elif row[i] == dataset.coordinates.lon_field:
                locs['lon'] = i
            elif row[i] == dataset.names.field1_name:
                locs['field1'] = i
            elif row[i] == dataset.names.field2_name:
                locs['field2'] = i
        for row in readfile:
            kwargs = {'dataset': dataset}
            for key in locs:
                if key in ['lat', 'lon']:
                    try:
                        kwargs[key] = float(row[locs[key]])
                    except Exception:
                        kwargs[key] = 0.
                elif MapPoint._meta.get_field(key).max_length < len(row[locs[key]]):
                    kwargs[key] = row[locs[key]][:MapPoint._meta.get_field(key).max_length]
                else:
                    kwargs[key] = row[locs[key]]
            try:
                kwargs['point'] = Point(kwargs['lon'], kwargs['lat'])
            except Exception:
                if verbose:
                    print '\tInvalid lat/long for row: %s' % (row)
                    print '\tLat: %f Lon: %f' % (kwargs['lat'], kwargs['lon'])
                continue
            mp = MapPoint(**kwargs)
            mp.save()
        if verbose:
            print 'File "%s" done processing' % (short_name)
def run(verbose=True, year=2010, starting_state=1):
    yn = ''
    # https://docs.djangoproject.com/en/1.7/ref/contrib/gis/layermapping/
    while DEBUG and yn != 'y':
        yn = raw_input('This process can be memory-intensive if '
                       'DEBUG = True in settings as this logs all SQL. '
                       'DEBUG is currently True. Please set this to False '
                       'if you are experiencing issues. Continue (y/n)?') \
            .lower().strip()
        if yn == 'n':
            return

    dataset_qs = Dataset.objects.filter(name__exact=str(year) + ' Census Tracts')
    if len(dataset_qs) > 0:
        ds = dataset_qs[0]
        ds.cached = datetime.utcnow().replace(tzinfo=utc)
    else:
        coor = GeoCoordinates(lat_field='INTPTLAT' + str(year)[-2:],
                              lon_field='INTPTLON' + str(year)[-2:])
        coor.save()
        names = DatasetNameField(field1_en='Land Area',
                                 field1_name='ALAND' + str(year)[-2:],
                                 field2_en='Water Area',
                                 field2_name='AWATER' + str(year)[-2:])
        names.save()
        ds = Dataset(name=str(year) + ' Census Tracts',
                     cached=datetime.utcnow().replace(tzinfo=utc),
                     cache_max_age=1000,
                     name_field='NAMELSAD' + str(year)[-2:],
                     coordinates=coor,
                     names=names)
        if year == 2010:
            ds.remote_id_field = 'GEOID00'
        elif year == 2000:
            ds.remote_id_field = 'CTIDFP00'
    ds.save()

    tract_mapping = {
        'remote_id': ds.remote_id_field,
        'name': ds.name_field,
        'lat': ds.coordinates.lat_field,
        'lon': ds.coordinates.lon_field,
        'field1': ds.names.field1_name,
        'field2': ds.names.field2_name,
        'mpoly': 'MULTIPOLYGON',
    }

    ftp = ftplib.FTP('ftp2.census.gov')
    ftp.login()
    ftp.cwd("/geo/tiger/TIGER2010/TRACT/" + str(year) + "/")
    files = ftp.nlst()

    MapPolygon.objects.filter(dataset_id__isnull=True).delete()
    max_state = MapPolygon.objects.filter(dataset_id__exact=ds.id).aggregate(Max('remote_id'))
    max_state = max_state['remote_id__max']
    if max_state is not None:
        try:
            max_state = int(max_state) / 1000000000
            if max_state >= starting_state:
                starting_state = max_state + 1
        except Exception:
            pass

    for i in [format(x, '#02d') for x in range(starting_state, 100)]:
        short_name = 'tl_2010_' + i + '_tract' + str(year)[-2:]
        tract_shp = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/' + short_name))
        if (not os.path.isfile(tract_shp + '.shp')
                or not os.path.isfile(tract_shp + '.shx')
                or not os.path.isfile(tract_shp + '.shp.xml')
                or not os.path.isfile(tract_shp + '.prj')
                or not os.path.isfile(tract_shp + '.dbf')):
            if short_name + '.zip' not in files:
                continue
            if verbose:
                print short_name + '.shp does not exist locally.\n\tDownloading from Census FTP...'
            try:
                # download the file
                local_file = open(tract_shp + '.zip', 'wb')
                ftp.retrbinary('RETR ' + short_name + '.zip', local_file.write)
                local_file.close()
                # open the zip
                zipped = zipfile.ZipFile(tract_shp + '.zip')
                for suffix in ['.shp', '.prj', '.dbf', '.shp.xml', '.shx']:
                    zipped.extract(short_name + suffix,
                                   os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')))
            except Exception as inst:
                if verbose:
                    print '\tException:', inst
                    print '\t' + short_name + '.shp did not download or unzip correctly. Moving on...'
                continue
        tract_shp = tract_shp + '.shp'
        if verbose:
            print '\tBegin layer mapping...'
        lm = LayerMapping(MapPolygon, tract_shp, tract_mapping, transform=False, encoding='iso-8859-1')
        while True:
            try:
                lm.save(strict=True, verbose=False)  # verbose)
                break  # exception part is untested, error didn't happen again
            except Exception as inst:
                yn = ''
                while yn not in ['n', 'y']:
                    yn = raw_input('Error saving: ' + str(inst) + '\nContinue (y/n)?').strip().lower()
                if yn == 'y':
                    MapPolygon.objects.filter(dataset_id__isnull=True).filter(remote_id__startswith=i).delete()
                else:
                    break
        if verbose:
            print '\tLayer mapping done.'
        MapPolygon.objects.filter(dataset=None).update(dataset=ds)
        if verbose:
            print '\tLayer associated with dataset.'
    ftp.quit()
    if verbose:
        print 'All shapefiles added.'
class NDDataset:
    """Dataset interface for each cache Dataset"""

    def __init__(self, dataset_name):
        """Initialize the dataset"""
        self.dataset_name = dataset_name
        self.channel_list = []
        self.db = CacheDB()
        try:
            self.ds = Dataset.objects.get(dataset_name=dataset_name)
        except ObjectDoesNotExist as e:
            self.fetchDataset()
        self.populateDataset()

    def fetchDataset(self):
        """Fetch a dataset to the list of cacheable datasets"""
        token = self.dataset_name.split('-')[0]
        try:
            json_info = json.loads(getURL('http://{}/ocpca/{}/info/'.format(settings.SERVER, token)))
        except Exception as e:
            logger.error("Token {} does not exist on the backend {}".format(token, settings.SERVER))
            raise NDTILECACHEError("Token {} does not exist on the backend {}".format(token, settings.SERVER))

        ximagesize, yimagesize, zimagesize = json_info['dataset']['imagesize']['0']
        xoffset, yoffset, zoffset = json_info['dataset']['offset']['0']
        xvoxelres, yvoxelres, zvoxelres = json_info['dataset']['voxelres']['0']
        scalinglevels = json_info['dataset']['scalinglevels']
        scalingoption = ND_scalingtoint[json_info['dataset']['scaling']]
        starttime, endtime = json_info['dataset']['timerange']
        project_name = json_info['project']['name']
        s3backend = json_info['project']['s3backend']

        self.ds = Dataset(dataset_name=self.dataset_name, ximagesize=ximagesize, yimagesize=yimagesize,
                          zimagesize=zimagesize, xoffset=xoffset, yoffset=yoffset, zoffset=zoffset,
                          xvoxelres=xvoxelres, yvoxelres=yvoxelres, zvoxelres=zvoxelres,
                          scalingoption=scalingoption, scalinglevels=scalinglevels,
                          starttime=starttime, endtime=endtime, project_name=project_name,
                          s3backend=s3backend)
        self.ds.save()

        for channel_name in json_info['channels'].keys():
            channel_type = json_info['channels'][channel_name]['channel_type']
            channel_datatype = json_info['channels'][channel_name]['datatype']
            startwindow, endwindow = json_info['channels'][channel_name]['windowrange']
            propagate = json_info['channels'][channel_name]['propagate']
            readonly = json_info['channels'][channel_name]['readonly']
            ch = Channel(channel_name=channel_name, dataset=self.ds, channel_type=channel_type,
                         channel_datatype=channel_datatype, startwindow=startwindow, endwindow=endwindow,
                         propagate=propagate, readonly=readonly)
            ch.save()

    def populateDataset(self):
        """Populate a dataset information using the information stored"""
        self.resolutions = []
        self.cubedim = {}
        self.supercubedim = {}
        self.imagesz = {}
        self.offset = {}
        self.voxelres = {}
        self.scale = {}
        self.timerange = [self.ds.starttime, self.ds.endtime]

        for i in range(self.ds.scalinglevels + 1):
            # add this level to the resolutions
            self.resolutions.append(i)

            # set the image size: the scaled down image rounded up to the nearest cube
            xpixels = ((self.ds.ximagesize - 1) / 2**i) + 1
            ypixels = ((self.ds.yimagesize - 1) / 2**i) + 1
            if self.ds.scalingoption == ZSLICES:
                zpixels = self.ds.zimagesize
            else:
                zpixels = ((self.ds.zimagesize - 1) / 2**i) + 1
            self.imagesz[i] = [xpixels, ypixels, zpixels]

            # set the offset
            xoffseti = 0 if self.ds.xoffset == 0 else ((self.ds.xoffset) / 2**i)
            yoffseti = 0 if self.ds.yoffset == 0 else ((self.ds.yoffset) / 2**i)
            if self.ds.zoffset == 0:
                zoffseti = 0
            else:
                if self.ds.scalingoption == ZSLICES:
                    zoffseti = self.ds.zoffset
                else:
                    zoffseti = ((self.ds.zoffset) / 2**i)
            self.offset[i] = [xoffseti, yoffseti, zoffseti]

            # set the voxelresolution
            xvoxelresi = self.ds.xvoxelres * float(2**i)
            yvoxelresi = self.ds.yvoxelres * float(2**i)
            zvoxelresi = self.ds.zvoxelres if self.ds.scalingoption == ZSLICES else self.ds.zvoxelres * float(2**i)
            self.voxelres[i] = [xvoxelresi, yvoxelresi, zvoxelresi]
            self.scale[i] = {'xy': xvoxelresi / yvoxelresi,
                             'yz': zvoxelresi / xvoxelresi,
                             'xz': zvoxelresi / yvoxelresi}

            # choose the cubedim as a function of the zscale
            # this may need to be changed.
            if self.ds.scalingoption == ZSLICES:
                if float(self.ds.zvoxelres / self.ds.xvoxelres) / (2**i) > 0.5:
                    self.cubedim[i] = [128, 128, 16]
                else:
                    self.cubedim[i] = [64, 64, 64]

                if self.ds.s3backend == S3_TRUE:
                    self.supercubedim[i] = map(mul, self.cubedim[i], SUPERCUBESIZE)
                else:
                    self.supercubedim[i] = self.cubedim[i]

                # Make an exception for bock11 data -- just an inconsistency in original ingest
                if self.ds.ximagesize == 135424 and i == 5:
                    self.cubedim[i] = [128, 128, 16]
            else:
                # RB what should we use as a cubedim?
                self.cubedim[i] = [128, 128, 16]

    def removeDataset(self):
        """Remove a dataset"""
        self.ds.delete()
        try:
            shutil.rmtree("{}/{}".format(settings.CACHE_DIR, self.dataset_name))
        except Exception as e:
            logger.error("Failed to remove dataset directories at {}. Error {}. "
                         "Manual cleanup may be necessary.".format(self.dataset_name, e))
            raise NDTILECACHEError("Failed to remove dataset directories at {}. Error {}. "
                                   "Manual cleanup may be necessary.".format(self.dataset_name, e))

    # Accessors
    def getDatasetName(self):
        return self.ds.dataset_name

    def getDatasetId(self):
        return self.ds.dataset_id

    def getS3Backend(self):
        return self.ds.s3backend

    def getImageSize(self, resolution):
        return self.imagesz[resolution]

    def getVoxelRes(self, resolution):
        return self.voxelres[resolution]

    def getProjectName(self):
        return self.ds.project_name

    def getChannelObj(self, channel_name):
        """Return a channel object"""
        return NDChannel(channel_name, self.ds)
def main(show_progress, *args, **kwargs):

    # Create a new fetch index for the records fetched.
    last_fetch_index = Dataset.select(fn.Max(Dataset.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    # Set up progress bar
    if show_progress:
        progress_bar = ProgressBar(widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Fetched metadata for ', Counter(), ' datasets.'
        ])
        progress_bar.start()

    # Fetch all pages of datasets
    datasets_fetched = 0
    last_page = False
    while not last_page:

        params = DEFAULT_PARAMS.copy()
        params['start'] = datasets_fetched
        resp = make_request(default_requests_session.get, URL, params=params).json()
        if not resp['success']:
            logging.error("Request to URL %s was unsuccessful", URL)

        result = resp['result']
        num_datasets = len(result['results'])
        datasets_fetched += num_datasets

        if show_progress:
            # We can finally initialize the total number of datasets expected
            # only after we get the first round of results.
            progress_bar.maxval = result['count']
            progress_bar.update(datasets_fetched)

        for dataset in result['results']:

            dataset_record = Dataset.create(
                dataset_id=dataset['id'],
                title=trim_char_data(dataset['title']),
                license_title=trim_char_data(dataset['license_title']),
                fetch_index=fetch_index,
            )

            for resource in dataset['resources']:
                if resource['format'] == DATA_FORMAT:
                    Resource.create(
                        resource_id=resource['id'],
                        dataset=dataset_record,
                        format=resource['format'],
                        url=resource['url'],
                    )

        time.sleep(REQUEST_DELAY)  # enforce a pause between each fetch to be respectful to API
        last_page = datasets_fetched >= result['count']

    if show_progress:
        progress_bar.finish()
file_dataset = "df_σ02_350_08Х18Н10Т.json" target_mech = "σ0,2_350" norm_mech = "σ0,2_350_norm" target = "is_defect" with open(file_dataset, 'r') as f: df = pd.DataFrame(json.loads(f.read())) print("Dataset: read is done!") output = defaultdict(list) for thr in tqdm.tqdm([1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1], desc="Thr"): df_train = df.assign(is_defect=lambda row: (row[target_mech] - thr * row[norm_mech] < 0).astype(int)).drop([target_mech, norm_mech], axis=1) share = df_train[target].mean() d = Dataset(data=json.dumps(df_train.select_dtypes(np.number).to_dict('records')), features=df_train.select_dtypes(np.number).drop(target, axis=1).columns, target=target) m = MlModel(model_type='RandomForestClassifier') search_space = OptParams(model_type=type(m.get_model()).__name__) opt = Opt(data=d, params=search_space, pipeline=m, metric=partial(precision_score, zero_division=0), trials=Trials() ) opt.start_opt() output['thr'] += [thr] output['share'] += [share] output['best_trial'] += [opt.trials.best_trial['result']]
def update_model(self):
    """
    Update the XGBoost model (gbm) using relevant data.
    This function uses the Films and Users models; please don't change them.
    :return: accuracy of the new model
    """
    # load the list of users
    count = Users.select().count()
    users = []
    for i in range(0, count, 100):
        usrs = Users.select().offset(i).limit(100).execute()
        for u in usrs:
            users.append(model_to_dict(u))

    # collect dataset
    dataset = []
    for i in range(0, count, 200):
        data = Dataset.select().order_by(
            Dataset.id).offset(i).limit(200).execute()
        for d in data:
            dataset.append(model_to_dict(d))
    dataset = self.filtr(dataset)
    dataset = [{
        "data": self.full_data(d["film"], d["user"]),
        "result": d["result"]
    } for d in dataset]

    X = [d["data"] for d in dataset]
    Y = [int(d["result"] > 0) for d in dataset]

    from sklearn.preprocessing import normalize
    X = normalize(X)

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

    # learning new model
    if MODEL == "gbm":
        model = xgb.XGBClassifier(
            max_depth=7,
            n_estimators=1600,
            learning_rate=0.01,
            subsample=0.3,
            # gamma=300,
            colsample_bytree=0.3).fit(X_train, y_train)
    else:
        pool = Pool(X_train, y_train)
        model = CatBoostClassifier(iterations=1600,
                                   learning_rate=0.01,
                                   depth=5,
                                   random_seed=7)
        model.fit(pool)

    @not_test
    def save():
        # save model
        import pickle
        pickle.dump(model, open(PATH_TO_DIR + "model", "wb"))

    # compute accuracy
    predictions = model.predict_proba(X_test)[:, 1]
    from sklearn.metrics import roc_auc_score
    test = roc_auc_score(predictions > 0.5, y_test)
    return test
def test_make_FragmentationSpectrum(self):
    d1 = Dataset(name='Dataset1')
    d1.save()
    FragmentationSpectrum(precursor_mz='123.456',
                          spec_num=0, dataset=d1).save()
    self.assertEqual(FragmentationSpectrum.objects.all().count(), 1)
def main(infile):
    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(train_set)

    # LOAD TEST SET
    dataset_test = Dataset.DatasetReview()
    dataset_test.load_review_from_csv(test_set)

    # preprocessor = DatasetPreprocessor()
    # dataset = preprocessor.fold_cases_d(dataset)
    # dataset = preprocessor.remove_punctuations_d(dataset)
    # dataset = preprocessor.convert_numbers_d(dataset)
    # dataset.export_only_contents("../Test/dataset.txt")

    # fe = BagFeatureExtractor(dataset.get_contents())
    # fe.build()
    # fe.save_vocab("../Test/vocab.txt")
    # dataset.export_formatted_dataset("formatted_dataset_wow.tsv")

    print "\n**** CROSS VALIDATION EVALUATION (CORPUS: WIKIPEDIA) ****\n"

    fe = BagFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_cross_validation(classifier, fe, dataset_train)

    fe = TfidfFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_cross_validation(classifier, fe, dataset_train)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(), infile=wiki_w2v_model, binary=False, dimen=200)
    # fe.save_model_to_file("vectors_full.txt", vocabfile="vocab_full.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_cross_validation(classifier, fe, dataset_train)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(), infile="vectors_full_wow.txt", binary=False, dimen=200)
    # fe.save_model_to_file("vectors_full.txt", vocabfile="vocab_full.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_cross_validation(classifier, fe, dataset_train)

    fe = SennaFeatureExtractor(dataset_train.get_contents(), infile="../senna_vectors.txt", vocabfile="../senna_vocab.txt")
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_cross_validation(classifier, fe, dataset_train)

    print "\n**** TRAINING SET EVALUATION (CORPUS: WIKIPEDIA) ****\n"

    fe = BagFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_training_set(classifier, fe, dataset_train)

    fe = TfidfFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_training_set(classifier, fe, dataset_train)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(), infile=wiki_w2v_model, binary=False, dimen=200)
    # fe.save_model_to_file("vectors_full_wow.txt", vocabfile="vocab_full_wow.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_training_set(classifier, fe, dataset_train)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(), infile="vectors_full_wow.txt", binary=False, dimen=200)
    # fe.save_model_to_file("vectors_full.txt", vocabfile="vocab_full.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_training_set(classifier, fe, dataset_train)

    fe = SennaFeatureExtractor(dataset_train.get_contents(), infile="../senna_vectors.txt", vocabfile="../senna_vocab.txt")
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_training_set(classifier, fe, dataset_train)

    print "TEST SET EVALUATION (CORPUS: WIKIPEDIA)"

    fe = BagFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)

    fe = TfidfFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(), infile=wiki_w2v_model, binary=False, dimen=200)
    # fe.save_model_to_file("vectors_full_wow.txt", vocabfile="vocab_full_wow.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(), infile="vectors_full_wow.txt", binary=False, dimen=200)
    # fe.save_model_to_file("vectors_full.txt", vocabfile="vocab_full.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)

    fe = SennaFeatureExtractor(dataset_train.get_contents(), infile="../senna_vectors.txt", vocabfile="../senna_vocab.txt")
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)
def main(): """ Sentiment Specific Embedding for twitter classification """ embeddings_size = 50 # Embedding size for SSWE model vocab_file = "Embedding/features/semeval_vocabs_200.txt" # path to the vocabulary file vector_file = "Embedding/features/semeval_vectors_200.txt" # path to the vector file stopwordsfile = "preprocess/stopwords.txt" """ Sentiment-Specific Word Embedding (SSWE) """ if True: # Load dataset data_train = 'dataset/training1600000.csv' # training data set file path pre_data_train = 'dataset/preprocessed_dataset1600000.csv' # file to save dataset after cleaning if True: print("\n **** Dataset cleaning ****") tweets_prepocess(data_train, pre_data_train, stopwordsfile) if True: print("\n **** SSWE model Trainig ****") train_model = None # path to the file contains the trained model if it is already exist save_model = "Embedding/models/SSWE_model_1600000_200" # path to the file where model will be saved sswe = create_sswe_model(pre_data_train, vocab_file, vector_file, train_model, save_model, embeddings_size) sswe_trainer(sswe) """ Embedding visualisation and Similarity computing """ if True: visualiser = Visualiser( sizeOfEmbedding=embeddings_size, VocabsFname=vocab_file, VectorsFname=vector_file, WVFilename="Visualisation/data/w2vformat.txt", visualizerHTMLfilename="Visualisation/data/embedding.html") visualiser.visualize() """ Twitter Sentiment Classification """ if True: # Data pre-processing print("\n **** Training data cleaning ****") pre_processing_train = "dataset/preprocessed_semeval_traindataset.csv" # tweets_prepocess(train_set, pre_processing_train, stopwordsfile) print("\n **** Test data cleaning ****") pre_processing_test = "dataset/preprocessed_semeval_testdataset.csv" # tweets_prepocess(test_set, pre_processing_test, stopwordsfile) # LOAD TRAIN SET dataset_train = Dataset.DatasetReview() dataset_train.load_review_from_csv(pre_processing_train) # LOAD TEST SET dataset_test = Dataset.DatasetReview() dataset_test.load_review_from_csv(pre_processing_test) ################################### Neural Nets classifier ########################### # Extract Features tweet2v = get_sswe_features(vocab_file, vector_file) # Extract samples and labels x_train, y_train = split_data(dataset_train) x_test, y_test = split_data(dataset_train) tfidf = build_tfidf(x_train) train_vecs_sswe = np.concatenate([ buildWordVector(z.split(), embeddings_size, tweet2v, tfidf) for z in tqdm(map(lambda x: x, x_train)) ]) train_vecs_sswe = scale(train_vecs_sswe) test_vecs_sswe = np.concatenate([ buildWordVector(z.split(), embeddings_size, tweet2v, tfidf) for z in tqdm(map(lambda x: x, x_test)) ]) test_vecs_sswe = scale(test_vecs_sswe) # neural network model neuralnets = NeuralNets(input_size=embeddings_size, x_train=train_vecs_sswe, y_train=y_train, epochs=450, batch_size=32, x_test=test_vecs_sswe, y_test=y_test) neuralnets.train_neural_nets() ########################################################################################## ######## ######## Classical classifiers with sklearn ######## ########################################################################################## print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") fe_sswe = SennaFeatureExtractor(infile=vector_file, vocabfile=vocab_file, dimen=embeddings_size) feature_extractors = [fe_sswe] ev = Evaluator() ################################# SVM ################################################### print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") model = Classifier(models="svm") kfold = 
KFold(n_splits=10) ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, training_set=dataset_train, num_fold=10, cv=kfold) ev.create_evaluation_result(model, feature_extractors=feature_extractors, training_set=dataset_train, num_fold=10, cv=kfold) print("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n") ev.eval_with_test_set(model, feature_extractors=feature_extractors, training_set=dataset_train, test_set=dataset_test) ################################### Naive bayes ########################################## print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") model = Classifier(models="multinomial") kfold = KFold(n_splits=10) ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, training_set=dataset_train, num_fold=10, cv=kfold) ev.create_evaluation_result(model, feature_extractors=feature_extractors, training_set=dataset_train, num_fold=10, cv=kfold) print("\n**** TEST SET EVALUATION (CORPUS: DATASET) ****\n") ev.eval_with_test_set(model, feature_extractors=feature_extractors, training_set=dataset_train, test_set=dataset_test) ######################################### RandomForestClassifier ####################### print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") model = Classifier(models="rfc") kfold = KFold(n_splits=10) ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, training_set=dataset_train, num_fold=10, cv=kfold) ev.create_evaluation_result(model, feature_extractors=feature_extractors, training_set=dataset_train, num_fold=10, cv=kfold) print("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n") ev.eval_with_test_set(model, feature_extractors=feature_extractors, training_set=dataset_train, test_set=dataset_test) ######################################### MLPClassifier ####################### print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n") model = Classifier(models="nn") kfold = KFold(n_splits=10) ev.eval_with_cross_validation(model, feature_extractors=feature_extractors, training_set=dataset_train, num_fold=10, cv=kfold) ev.create_evaluation_result(model, feature_extractors=feature_extractors, training_set=dataset_train, num_fold=10, cv=kfold) print("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n") ev.eval_with_test_set(model, feature_extractors=feature_extractors, training_set=dataset_train, test_set=dataset_test)