Example #1
def main():

    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(train_set)

    # LOAD TEST SET
    dataset_test = Dataset.DatasetReview()
    dataset_test.load_review_from_csv(test_set)

    fe_tfidf = TfidfFeatureExtractor(size=500)
    fe_w2v = WordEmbeddingFeatureExtractor(infile=w2v_vec_path, binary=False, dimen=500)
    fe_sswe_w2v = WordEmbeddingFeatureExtractor(infile=sswe_w2v, binary=False, dimen=500, sswe=1)
    fe_sswe = SennaFeatureExtractor(infile=sswe_senna_vectors, vocabfile=sswe_senna_vocabs, dimen=500)

    feature_extractors = [fe_tfidf, fe_w2v, fe_sswe_w2v, fe_sswe]

    ev = Evaluator()

    print "\n**** CROSS VALIDATION EVALUATION (CORPUS: DATASET) ****\n"
    model = Classifier(models="nn")

    kfold = KFold(n_splits=10)
    ev.eval_with_cross_validation(model, feature_extractors=feature_extractors,
                                    training_set=dataset_train, num_fold=10, cv=kfold)

    model = Classifier(models="nn")
    ev.create_evaluation_result(model, feature_extractors=feature_extractors,
                                    training_set=dataset_train, num_fold=10, cv=kfold)

    print "\n**** TEST SET EVALUATION (CORPUS: DATASET) ****\n"
    model = Classifier(models="nn")
    ev.eval_with_test_set(model, feature_extractors=feature_extractors,
                            training_set=dataset_train,
                            test_set=dataset_test)
Example #2
def run(verbose=True):
    ds = Dataset(name = '2010 Census Tracts',
        cached = datetime.utcnow().replace(tzinfo=utc),
        cache_max_age = 1000,
        remote_id_field = 'GEOID10',
        name_field = 'NAMELSAD10',
        lat_field = 'INTPTLAT10',
        lon_field = 'INTPTLON10',
        field1_en = 'Land Area',
        field1_name = 'ALAND10',
        field2_en = 'Water Area',
        field2_name = 'AWATER10')


    tract_mapping = {
        'remote_id' : ds.remote_id_field,
        'name' : ds.name_field,
        'lat' : ds.lat_field,
        'lon' : ds.lon_field,
        'field1' : ds.field1_name,
        'field2' : ds.field2_name,
        'mpoly' : 'MULTIPOLYGON',
    }

    tract_shp = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/tl_2010_36_tract10.shp'))

    lm = LayerMapping(MapPolygon, tract_shp, tract_mapping, transform=False, encoding='iso-8859-1')

    lm.save(strict=True, verbose=verbose)

    ds.save()

    MapPolygon.objects.filter(dataset = None).update(dataset = ds)
Example #3
def api_dataset_index(user, values=None):
    if request.method == 'GET':
        return json.dumps(serializers.user_datasets(user))
    else:
        validation = Dataset.validate(values)
        if validation == True:
            dataset = Dataset.from_values(user=user, values=values)
            return response_success(serializers.dataset(dataset))
        else:
            return response_error(validation)
Example #4
def dataset_upload(request):

  user = request.user

  if request.method == 'POST':
    if user.is_authenticated():

      file = request.FILES.get('filename', '')

      file_name = file.name
      dest_dir = os.path.join(settings.USR_DATASET_ROOT, user.username)
      if not os.path.exists(dest_dir):
        os.makedirs(dest_dir)

      full_path = os.path.join(dest_dir, file_name)
      rel_path = os.path.join(user.username, file_name)
      destination = open(full_path, "wb+")
      for chunk in file.chunks():
        destination.write(chunk)
      destination.close()

      description = request.POST['description']
      access = request.POST['access']
      tbl_separator = {"tab":'\t', "space":' ', "comma":',', "semicolon":';'}
      sep_str = request.POST['sep']
      sep = tbl_separator[sep_str]
      header = request.POST['header']
      if header == 'yes':
        header = True
      elif header == 'no':
        header = False

      ## a simple check
      size = 0
      for line in open(full_path):
        size += 1
      dim = len(line.split(sep))
      if header == True:
        size -= 1 # exclude the header line

      new_dataset = Dataset(owner=user, path=rel_path, name=file_name, dim=dim,
                            size=size, description=description, access=access,
                            sep=sep_str, header=header)
      new_dataset.save()

      notice = "Congratulations! Your dataset has been successfully uploaded."
      # return render_to_response('dataset/success.html', RequestContext(request, {'dataset': new_dataset, 'notice': notice}))
      return HttpResponseRedirect('/datasets/%s/' % new_dataset.id)

    else:
      notice = "You must be logged in to upload datasets"
      form = UploadDatasetForm()
      return render_to_response('dataset/upload.html', RequestContext(request, {'form': form, 'notice': notice}))

  else:
    form = UploadDatasetForm()
    return render_to_response('dataset/upload.html', RequestContext(request, {'form': form}))
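The "simple check" above just counts lines and splits the last one to infer the table shape. A minimal standalone sketch of that step, using an inline sample in place of the uploaded file (the sample data is hypothetical):

sample = "name,age\nalice,30\nbob,25\n"
sep, has_header = ',', True
lines = sample.splitlines()
size = len(lines)                    # number of rows read
dim = len(lines[-1].split(sep))      # columns in the last line, as above
if has_header:
    size -= 1                        # exclude the header line
print(size, dim)                     # 2 2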
Example #5
 def test_is_not_empty_if_annotated(self):
     m1 = Molecule(name='test', sum_formula="C1H2O3")
     m1.save()
     s1 = Standard(molecule=m1)
     s1.save()
     d1 = Dataset()
     d1.save()
     fs1 = FragmentationSpectrum(ms1_intensity=42, dataset=d1, standard=s1)
     fs1.save()
     molecule_table, molecules_with_spectra = self.get_table_and_count()
     self.assertEqual(len(molecule_table.rows), 1)
     self.assertEqual(molecules_with_spectra, 1)
Example #6
def example():
    dataset = Dataset()

    meta, data = dataset.at(0, xy=False)
    symbol = Symbol(meta, data)
    symbols = (
        symbol['count'].apply(np.log)
        ==
        symbol[['age', 'smoke']].interact(lambda x: x[0]*x[1], name='age_smoke') +
        symbol['age'] + symbol['smoke'] + symbol['drug'] + symbol['partners'] + symbol['cesd']
    )
    return symbols
Example #7
 def test_make_FragmentationSpectrum_with_centroids(self):
     d1 = Dataset(name='Dataset1')
     d1.save()
     f1 = FragmentationSpectrum(precursor_mz='123.456',
                                spec_num=0, dataset=d1)
     mzs = [10., 20, 50]
     ints = [1., 1., 1.]
     f1.set_centroid_mzs(mzs)
     f1.set_centroid_ints(ints)
     f1.save()
     np.testing.assert_array_almost_equal(mzs, f1.centroid_mzs)
     np.testing.assert_array_almost_equal(ints, f1.centroid_ints)
Example #8
def get_dataset(datastore, id):
    '''
    Creates a dataset object from the .valid file
    '''
    try:
        datastore.download(id + '/uploads/.valid')
    except AttributeError:
        return None
    with open(id + '/uploads/.valid', 'r') as validfile:
        if validfile.read() == id:
            dataset = Dataset(id)
            dataset.datastore = datastore
            return dataset
Example #9
    def test_xic_and_standard_and_adduct(self):
        # create some datasets
        d1 = Dataset(name='Dataset1')
        d1.save()
        a1 = Adduct(nM=1, delta_formula='-H', charge=-1)
        a1.save()
        m1 = Molecule(name='TestMolecule1', sum_formula="C1H2O3")
        m1.save()
        s1 = Standard(molecule=m1, inventory_id="0")
        s1.save()
        # create some xics
        x1 = Xic(mz=60.993, dataset=d1)
        xic = [1.0, 2.0, 3.0, 4.0, 5.0]
        x1.set_xic(xic)
        x1.standard = s1
        x1.adduct = a1
        x1.save()
        self.assertEqual(Xic.objects.all().count(), 1)
        self.assertEqual(Dataset.objects.all().count(), 1)
        self.assertEqual(Standard.objects.all().count(), 1)
        # mass check
        with self.assertRaises(ValueError):
            x1.mz = 123.993
            x1.save()
            x1.check_mass()

    def test_xic_mass_filter(self):
        d1 = Dataset(name='dataset')
        d1.save()
        mz = 60.993
        # three larger
        Xic(mz=mz + 5., dataset=d1).save()
        Xic(mz=mz + 10., dataset=d1).save()
        Xic(mz=mz + 15., dataset=d1).save()
        # three approx equal
        Xic(mz=mz + 0.005, dataset=d1).save()
        Xic(mz=mz + 0.0, dataset=d1).save()
        Xic(mz=mz - 0.0015, dataset=d1).save()
        # three smaller
        Xic(mz=mz - 5., dataset=d1).save()
        Xic(mz=mz - 10., dataset=d1).save()
        Xic(mz=mz - 15., dataset=d1).save()
        # three approx equal from another dataset
        d2 = Dataset(name='dataset2')
        d2.save()
        Xic(mz=mz + 0.005, dataset=d2).save()
        Xic(mz=mz + 0.0, dataset=d2).save()
        Xic(mz=mz - 0.0015, dataset=d2).save()
        self.assertEqual(Xic.objects.all().count(), 12)
        xics = Xic.objects.all().filter(dataset=d1).filter(mz__gte=mz - 0.01).filter(mz__lte=mz + 0.01)
        self.assertEqual(xics.count(), 3)
Example #10
def get_dataset(datastore, id):
    '''
    Creates a dataset object from the .valid file
    '''
    try:
        valid_path = '{0}/uploads/.valid'.format(id)
        valid_file = datastore.read(valid_path)
    except AttributeError:
        return None

    if valid_file.read() == id:
        dataset = Dataset(id)
        dataset.datastore = datastore
        return dataset
Example #12
def dataset():
    print(request.method)
    form = Dataset(request.form)
    #root = Tk()
    #root.filename =  filedialog.asksaveasfilename(initialdir = "/",title = "Select file",filetypes = (("jpeg files","*.jpg"),("all files","*.*")))
    #name = asksaveasfilename()
    #with open(name + '.csv', 'w', newline='') as csvfile:
    #   create = csv.writer(csvfile)
    #  create.writerow(["adam","deboosere"])

    if not form.validate_on_submit():
        return render_template('dataset.html', form=form)
    if request.method == 'POST':
        #return 'Submitted!'
        return render_template('dataset.html', form=form)
Example #13
def new_dataset():
    '''
    Create a unique url for this dataset to work under
    Create a folder on S3 using this url
    '''
    # Make a new dataset object
    id = str(uuid.uuid4())
    dataset = Dataset(id)
    dataset.datastore = make_datastore(app.config['DATASTORE'])
    
    # Write a verifying file to prove we created these folders
    validname = '{0}/uploads/.valid'.format(dataset.id)
    dataset.datastore.write(validname, StringIO(dataset.id))

    return redirect('/datasets/' + dataset.id)
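new_dataset above writes a ".valid" marker that get_dataset later reads back and compares to the id. A minimal sketch of that handshake, with a hypothetical in-memory datastore standing in for S3:

from io import StringIO
import uuid

class MemoryDatastore:
    """Hypothetical stand-in for the S3-backed datastore used above."""
    def __init__(self):
        self.files = {}
    def write(self, path, buf):
        self.files[path] = buf.read()
    def read(self, path):
        return StringIO(self.files[path])

store = MemoryDatastore()
dataset_id = str(uuid.uuid4())
store.write('{0}/uploads/.valid'.format(dataset_id), StringIO(dataset_id))
# The reader side only trusts the folder if the marker matches the id.
assert store.read('{0}/uploads/.valid'.format(dataset_id)).read() == dataset_id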
Example #15
    def add_dataset():
        body = request.get_json()

        name = body.get('name')
        type = body.get('type')
        description = body.get('description')
        provider_id = body.get('provider_id')

        try:
            dataset = Dataset(name, provider_id, type, description)
            dataset.insert()

            return jsonify({'success': True, 'dataset_id': dataset.id})
        except Exception as es:
            print(es)
            abort(422)
Example #16
def create_dataset(sessionconfig, params):

    session = sessionconfig[0]
    config = sessionconfig[1]

    checkDataset = session.query(Dataset).from_statement(text("SELECT * FROM datasets where name=:name")).\
        params(name=params['name']).first()

    if(checkDataset is None):

        dataset = Dataset(name=params['name'], identifier=params['identifier'],
                          description=params['description'], details=params['details'],
                          module_parameters='', created=params['created'], user=params['user'],
                          fileformat="Parquet", filepath=params['filepath'],
                          schema=params['schema'], module_id='')
        shutil.copyfile(config['METADATA_LOCAL_PATH'], config['BACKUP_METADATA_LOCAL_PATH'])

        session.add(dataset)
        session.commit()

        objs = []
        if(config['BACKEND'] == 'hdfs'):
            objs.append((config['MODULES_DIR'] + 'sqlite.db', config['METADATA_LOCAL_PATH']))
        elif(config['BACKEND'] == 'swift'):
            objs.append(('sqlite.db', config['METADATA_LOCAL_PATH']))
        elif(config['BACKEND'] == 'nfs'):
            pass

        saveObjsBackend(objs, config['BACKEND'], config)

    else:
        raise RuntimeError("The dataset with name " + params['name'] + " already exists")
Example #17
def create_featureset(sessionconfig, params):

    session = sessionconfig[0]
    config = sessionconfig[1]

    modulename = params['modulename']
    analysisMod = session.query(Analysis).from_statement(text("SELECT * FROM analysis where name=:name")).\
        params(name=modulename).first()

    if(analysisMod):  # Check if the module exists

        # module_id = analysisMod.id
        checkDataset = session.query(Dataset).from_statement(text("SELECT * FROM datasets where name=:name")).\
            params(name=params['name']).first()

        if(checkDataset is None):
            dataset = Dataset(name=params['name'], identifier='',
                              description=params['description'], details=params['details'],
                              module_parameters=params['module_parameters'],
                              created=params['created'], user=params['user'],
                              fileformat="Parquet", filepath=params['filepath'],
                              schema=params['schema'], module_id=analysisMod.id)
            shutil.copyfile(config['METADATA_LOCAL_PATH'], config['BACKUP_METADATA_LOCAL_PATH'])

            session.add(dataset)
            session.commit()

        else:
            raise RuntimeError('The feature set with the name ' + params['name'] + ' already exists')
    else:
        raise RuntimeError('No Such Module')
Example #18
def insert_dataset(nex_session, fw, x, parent_dataset_id):

    print "DATASET:", x

    y = Dataset(format_name=x['format_name'],
                display_name=x['display_name'],
                obj_url="/dataset/" + x['format_name'],
                source_id=x['source_id'],
                dbxref_id=x.get('dbxref_id'),
                dbxref_type=x.get('dbxref_type'),
                date_public=x.get('date_public'),
                parent_dataset_id=x.get('parent_dataset_id'),
                assay_id=x.get('assay_id'),
                channel_count=x.get('channel_count'),
                sample_count=x.get('sample_count'),
                is_in_spell=x.get('is_in_spell'),
                is_in_browser=x.get('is_in_browser'),
                description=x.get('description'),
                created_by=CREATED_BY)

    nex_session.add(y)
    nex_session.flush()
    nex_session.refresh(y)

    fw.write("Insert dataset: " + x['display_name'] + " into database\n")

    return y.dataset_id
Example #19
  def fetchDataset (self):
    """Fetch a dataset to the list of cacheable datasets"""

    token = self.dataset_name.split('-')[0]
    
    try:
      json_info = json.loads(getURL('http://{}/ocpca/{}/info/'.format(settings.SERVER, token)))
    except Exception as e:
      logger.error("Token {} doesn not exist on the backend {}".format(token, settings.SERVER))
      raise NDTILECACHEError("Token {} doesn not exist on the backend {}".format(token, settings.SERVER))
    
    ximagesize, yimagesize, zimagesize = json_info['dataset']['imagesize']['0']
    xoffset, yoffset, zoffset = json_info['dataset']['offset']['0']
    xvoxelres, yvoxelres, zvoxelres = json_info['dataset']['voxelres']['0']
    scalinglevels = json_info['dataset']['scalinglevels']
    scalingoption = ND_scalingtoint[json_info['dataset']['scaling']]
    starttime, endtime = json_info['dataset']['timerange']
    project_name = json_info['project']['name']
    s3backend = json_info['project']['s3backend']
    
    self.ds = Dataset(dataset_name=self.dataset_name, ximagesize=ximagesize,
                      yimagesize=yimagesize, zimagesize=zimagesize, xoffset=xoffset,
                      yoffset=yoffset, zoffset=zoffset, xvoxelres=xvoxelres,
                      yvoxelres=yvoxelres, zvoxelres=zvoxelres, scalingoption=scalingoption,
                      scalinglevels=scalinglevels, starttime=starttime, endtime=endtime,
                      project_name=project_name, s3backend=s3backend)
    self.ds.save()

    for channel_name in json_info['channels'].keys():
      channel_name = channel_name
      dataset_id = self.dataset_name
      channel_type = json_info['channels'][channel_name]['channel_type']
      channel_datatype = json_info['channels'][channel_name]['datatype']
      startwindow, endwindow = json_info['channels'][channel_name]['windowrange']
      propagate = json_info['channels'][channel_name]['propagate'] 
      readonly = json_info['channels'][channel_name]['readonly']
      ch = Channel(channel_name=channel_name, dataset=self.ds, channel_type=channel_type, channel_datatype=channel_datatype, startwindow=startwindow, endwindow=endwindow, propagate=propagate, readonly=readonly)
      ch.save()
Example #20
def datasetr():
    print(request.method)

    form = Dataset(request.form)
    nameMale = request.form["manname"]
    femalName = request.form["womenname"]
    language = request.form["language"]
    time = request.form["time"]
    min_tweet = request.form["min_number"]
    (tweets, y) = getTweets2(int(time), nameMale, femalName, language,
                             int(min_tweet))
    # tweets = ["Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies"]
    #form.tweet. = 'ada'
    #y = ["Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies"]
    y2 = np.array(y)
    f = np.count_nonzero(y2 == 0)
    h = np.count_nonzero(y2 == 1)
    detail = [
        'Nombre de tweets: ' + str(len(y)), 'Femme: ' + str(f),
        'Homme: ' + str(h)
    ]
    #flash(str(request.form))
    if request.method == 'POST':
        return render_template('datasetresult.html',
                               form=form,
                               tweets=tweets,
                               detail=detail)
Example #21
def main(train_set, output_path):

    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(train_set)

    # LOAD TEST SET
    # dataset_test = Dataset.DatasetReview()
    # dataset_test.load_review_from_csv(test_set)

    # preprocessor = DatasetPreprocessor()
    # dataset = preprocessor.fold_cases_d(dataset)
    # dataset = preprocessor.remove_punctuations_d(dataset)
    # dataset = preprocessor.convert_numbers_d(dataset)

    # dataset.export_only_contents("../Test/dataset.txt")

    # fe = BagFeatureExtractor(dataset.get_contents())
    # fe.build()
    # fe.save_vocab("../Test/vocab.txt")

    # dataset.export_formatted_dataset("formatted_dataset_wow.tsv")
    if not os.path.exists(output_path):
        os.makedirs(output_path)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(), dimen=300)
    fe.save_model_to_file(output_path + "vectors_full_300.txt",
                          vocabfile=output_path + "vocab_full_300.txt",
                          binary=False)
Example #22
 def test_add_xic(self):
     m1 = Molecule(name='TestMolecule1', sum_formula="C1H2O3")
     m1.save()
     s1 = Standard(molecule=m1, inventory_id="0")
     s1.save()
     a1 = Adduct(nM=1, delta_formula='+H+K', charge=-2)
     a1.save()
     d1 = Dataset(name='Dataset1')
     d1.save()
     d1.standards_present.add(s1)
     d1.adducts_present.add(a1)
     x1 = Xic(mz='0.0', dataset=d1)
     xic = [1.0, 2.0, 3.0, 4.0, 5.0]
     x1.set_xic(xic)
     x1.save()
     self.assertEqual(Xic.objects.all().count(), 1)
     np.testing.assert_array_almost_equal(xic, x1.xic)
Example #23
    def create_dataset(self, user, name, parent=None):
        if parent:
            parent = self.session.query(Dataset).filter(
                Dataset.name == parent).first()

        path = hashlib.sha256(user.name + name).hexdigest()
        dataset = Dataset(name=name, owner=user, path=path, parent=[parent])
        abspath = os.path.join(self.path, path)
        os.makedirs(abspath)
Example #24
def save_form_data(form, file_name):
    """Save the data associated with an uploaded dataset."""
    data_dict = dict((field, form[field]) for field in form)
    # Now delete the unnecessary keys...
    del data_dict['submit'], data_dict['csrf']
    data_dict['file_name'] = file_name
    dataset = Dataset(**data_dict)
    db.session.add(dataset)
    db.session.commit()
Example #25
 def setUpTestData(cls):
     d1 = Dataset(name='Dataset1')
     d1.save()
     m1 = Molecule(sum_formula='H2O')
     m1.save()
     m2 = Molecule(sum_formula='O2')
     m2.save()
     s1 = Standard(molecule=m1)
     s1.save()
     s2 = Standard(molecule=m1)
     s2.save()
     s3 = Standard(molecule=m2)
     s3.save()
     FragmentationSpectrum.objects.create(precursor_mz='123.456', spec_num=0, dataset=d1, standard=s1)
     FragmentationSpectrum.objects.create(precursor_mz='123.45', spec_num=0, dataset=d1, standard=s2)
     FragmentationSpectrum.objects.create(precursor_mz='123.4', spec_num=0, dataset=d1, standard=s3)
     cls.m_onespectrum = m2
     cls.m_twospectra = m1
Example #26
def initvalidateddataset():
    lines = list(open(data_validated_csv).readlines())

    for line in lines:
        audio_path,lenght,text = line.split(',')
        text = text.replace('\n','')
        new_data= Dataset()
        new_data.text = text
        new_data.audio_lenght = lenght
        new_data.file_path= audio_path
        new_data.file_with_user = 0 # 1 if user validating this instance
        new_data.instance_validated = 1 #1 if human validated this instance
        new_data.instance_valid = 1 # 1 if instance is ok
        new_data.user_validated = 'edresson'
        db.session.add(new_data)
    db.session.commit()
Example #27
    def get(self):

        # Check if datasets are loaded in datastore

        # Items in datastore
        d = Dataset.query().count()
        # Items in CDB
        q = "select count(*) as c from resource_staging" + \
            " where ipt is true and networks like '%VertNet%';"
        c = cartodb_query(q)[0]['c']

        # Number of reports stored in the datastore
        num_reports = Report.query().count()

        periods = Period.query()
        num_periods = periods.count()

        periods_done = Period.query(Period.status == "done")
        num_periods_done = periods_done.count()

        periods_progress = Period.query(Period.status == "in progress")
        num_periods_progress = periods_progress.count()

        periods_failed = Period.query(Period.status == "failed")
        num_periods_failed = periods_failed.count()

        resp = {
            "Datastore integrity": [
                {"Datasets in CartoDB": c},
                {"Datasets in the Datastore": d}
            ],
            "Report periods": [
                {"Stored periods": num_periods},
                {"Stored reports": num_reports},
                {"Periods completed": num_periods_done},
                {"Periods in progress": num_periods_progress},
                {"Periods failed": num_periods_failed},
            ]
        }

        if c != d or c == 0:
            dataset_setup_url = "http://%s/setup_datasets" % _HOSTNAME
            resp["Datastore integrity"].append({"URL for dataset setup": dataset_setup_url})
        if num_periods > 0:
            links_to_periods = ["http://%s/status/period/%s" % (_HOSTNAME, x.key.id()) for x in periods.fetch()]
            resp["Report periods"].append({"Links to periods": links_to_periods})
        if num_periods_done > 0:
            resp['Report periods'].append({'List of periods done': [x.period.strftime("%Y-%m") for x in periods_done.fetch()]})
        if num_periods_progress > 0:
            resp['Report periods'].append({'List of periods in progress': [x.period.strftime("%Y-%m") for x in periods_progress.fetch()]})
        if num_periods_failed > 0:
            resp['Report periods'].append({'List of periods failed': [x.period.strftime("%Y-%m") for x in periods_failed.fetch()]})

        self.response.headers['content-type'] = "application/json"
        self.response.write(json.dumps(resp))
Example #28
 def test_add_dataset(self):
     # create standards
     m1 = Molecule(name='TestMolecule1', sum_formula="C1H2O3")
     m1.save()
     m2 = Molecule(name='TestMolecule1', sum_formula="C2H2O3")
     m2.save()
     s1 = Standard(molecule=m1, inventory_id="0")
     s1.save()
     s2 = Standard(molecule=m2, inventory_id="1")
     s2.save()
     # create adduct
     a1 = Adduct(nM=1, delta_formula='+H+K', charge=-2)
     a1.save()
     # create a dataset
     d1 = Dataset(name='Dataset1')
     d1.save()
     d1.standards_present.add(s1)
     d1.standards_present.add(s2)
     d1.adducts_present.add(a1)
     self.assertEqual(Dataset.objects.all().count(), 1)
     self.assertEqual(Dataset.objects.all()[0].standards_present.count(), 2)
Example #29
def savecsv():
    print(request.method)

    form = Dataset(request.form)

    if 'tweets' in request.form:
        tweets = request.form['tweets']
        tweetsR = eval(tweets)
        #return str(tweetsR[0])
        #name = asksaveasfilename()

        if request.form['typef'] == 'json':
            root = Tk()
            root.filename = filedialog.asksaveasfilename(
                initialdir="/",
                title="Destination du Json",
                filetypes=(("JSON", "*.json"), ("all files", "*.*")))
            #root.mainloop()
            root.destroy()
            with open(root.filename + '.json', 'w') as out_f:
                json.dump(tweetsR, out_f)
            out_f.close()
        else:
            root = Tk()
            root.filename = filedialog.asksaveasfilename(
                initialdir="/",
                title="Destination du dataset",
                filetypes=(("xlsx", "*.xlsx"), ("all files", "*.*")))
            #root.mainloop()
            root.destroy()
            workbook = xlsxwriter.Workbook(root.filename + '.xlsx')
            worksheet = workbook.add_worksheet()

            rowEx = 1
            worksheet.write(0, 0, 'text')
            worksheet.write(0, 1, 'gender')
            for tweet in tweetsR:
                worksheet.write(rowEx, 0, tweet[0])
                worksheet.write(rowEx, 1, tweet[1])
                rowEx += 1

            workbook.close()
        #with open(name + '.csv', 'w', newline='') as csvfile:
        #   create = csv.writer(csvfile)
        #  create.writerow(tweets)

# tweets = ["Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies"]
#form.tweet. = 'ada'
#y = ["Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies", "Bonjour la famillle ca va !?", "Adaaam ca fait una bail", "oh la la les homme tous les meme", "Bonjouuuuuuuuuuuuuuuuuuuur mes amies"]

#flash(str(request.form))
    if request.method == 'POST':
        return render_template('datasetresult.html', form=form, tweets=tweetsR)
Example #30
 def test_xic_mass_filter(self):
     d1 = Dataset(name='dataset')
     d1.save()
     mz = 60.993
     # three larger
     Xic(mz=mz + 5., dataset=d1).save()
     Xic(mz=mz + 10., dataset=d1).save()
     Xic(mz=mz + 15., dataset=d1).save()
     # three approx equal
     Xic(mz=mz + 0.005, dataset=d1).save()
     Xic(mz=mz + 0.0, dataset=d1).save()
     Xic(mz=mz - 0.0015, dataset=d1).save()
     # three smaller
     Xic(mz=mz - 5., dataset=d1).save()
     Xic(mz=mz - 10., dataset=d1).save()
     Xic(mz=mz - 15., dataset=d1).save()
     # three approx equal from another dataset
     d2 = Dataset(name='dataset2')
     d2.save()
     Xic(mz=mz + 0.005, dataset=d2).save()
     Xic(mz=mz + 0.0, dataset=d2).save()
     Xic(mz=mz - 0.0015, dataset=d2).save()
     self.assertEqual(Xic.objects.all().count(), 12)
     xics = Xic.objects.all().filter(dataset=d1).filter(mz__gte=mz - 0.01).filter(mz__lte=mz + 0.01)
     self.assertEqual(xics.count(), 3)
Example #31
def main(infile):

    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(infile)

    dataset_train.export_only_contents("sentences_new.txt")

    # fe = BagFeatureExtractor(dataset.get_contents())
    # fe.build()
    # fe.save_vocab("../Test/vocab.txt")

    dataset_train.export_formatted_dataset("formatted_dataset_wow.tsv")
Example #32
def init_db():
    # import all modules here that might define models so that
    # they will be registered properly on the metadata.  Otherwise
    # you will have to import them first before calling init_db()
    from models import Department, User, Role, Dataset
    Base.metadata.drop_all(bind=engine)
    Base.metadata.create_all(bind=engine)

    # Create the fixtures
    engineering = Department(name='Engineering')
    db_session.add(engineering)
    hr = Department(name='Human Resources')
    db_session.add(hr)

    manager = Role(name='manager')
    db_session.add(manager)
    engineer = Role(name='engineer')
    db_session.add(engineer)

    peter = User(name='Peter', department=engineering, role=engineer)
    db_session.add(peter)
    roy = User(name='Roy', department=engineering, role=engineer)
    db_session.add(roy)
    tracy = User(name='Tracy', department=hr, role=manager)
    db_session.add(tracy)

    # Dataset
    import random
    from random import randint
    from faker import Faker
    fake = Faker('en_US')
    nPoints = 11

    # data = {'x': [randint(0, 1000) for i in range(nPoints)], 'z': [float(random.randrange(0, 1000))/100 for i in range(nPoints)], 'names': [fake.name() for i in range(nPoints)] }
    data = {
        'x': [int(i) for i in range(nPoints)],
        'z': [float(i) for i in range(nPoints)],
        'names': [fake.name() for i in range(nPoints)]
    }
    test_data1 = Dataset(name='dataset1',
                         description='First dataset',
                         table_name='data1',
                         enabled=True,
                         raw=data)
    db_session.add(test_data1)

    # data = {'x': [randint(0, 1000) for i in range(nPoints)], 'z': [float(random.randrange(0, 1000))/100 for i in range(nPoints)], 'names': [fake.name() for i in range(nPoints)] }
    # test_data2 = Dataset(name='dataset2', description='Second dataset', table_name='data2', enabled=False, raw=data)
    # db_session.add(test_data2)

    db_session.commit()
Example #33
def example_1_2_6():
    # Data preprocessing
    meta, data= Dataset().at(4, xy=False)
    mapper = collections.Counter(data['id'])
    for id in mapper:
        index = data['id'] == id
        data.loc[index, 'weight'] = data.loc[index, 'weight'].mean()
    # Symbolic form of the linear model
    symbol = Symbol(meta, data)
    symbols = (
        symbol['weight']
        ==
        symbol['gender'] + symbol['dose'] +
        symbol['id'].apply(lambda xs: [mapper[x] for x in xs], 'size')
    )
    return symbols
Example #34
    def freeze_dataset(self, id_or_name):
        """
        Crea una imagen temporal del contenido de un dataset.

        Args:
        ====
            - id_or_name:
                - str().
                - Id o Nombre del dataset que deseo freezar.
        Returns:
        =======
            - Dataset: Si el objeto es localizable & "Freezable".

        Exceptions:
        ==========
            - ValueError:
                - id_or_name esta unicode o str pero es del len == 0.
            - TypeError:
                - id_or_name no es un str o unicode.
        """
        from models import Dataset
        stored_dataset = self.retrieve_dataset_metadata(id_or_name)
        if stored_dataset:
            freezed_dataset = {
                "license_title": stored_dataset['license_title'],
                "maintainer": stored_dataset['maintainer'],
                "private": stored_dataset['private'],
                "maintainer_email": stored_dataset['maintainer_email'],
                "id": stored_dataset['id'],
                "owner_org": stored_dataset['owner_org'],
                "author": stored_dataset['author'],
                "isopen": stored_dataset['isopen'],
                "author_email": stored_dataset['author_email'],
                "state": stored_dataset['state'],
                "license_id": stored_dataset['license_id'],
                "type": stored_dataset['type'],
                "groups": [g['name'] for g in stored_dataset['groups']],
                "creator_user_id": stored_dataset['creator_user_id'],
                "name": stored_dataset['name'],
                "url": stored_dataset['url'],
                "notes": stored_dataset['notes'],
                "title": stored_dataset['title'],
                "license_url": stored_dataset['license_url']
            }
            return Dataset(datadict=freezed_dataset,
                           _distribution_literal=True,
                           _distributions=stored_dataset['resources'])
Example #35
    def diff_datasets(dataset_a=None, dataset_b=None):
        """
        Compara dos datasets y retorna la diferencia aditiva de ambos.

        Cuando se realiza la diferencia, el valor que prevalece es el de
        dataset_b.

        Args:
        ====
            - dataset_a:
                - Dataset().
                - Solo admite ser de tipo Dataset().

            - dataset_b:
                - Dataset().
                - Solo admite ser de tipo Dataset().

        Returns:
        =======
            - Dataset().

        Exceptions:
        ==========
            TypeError:
                - Uno o ambos argumentos, no son de clase Dataset.
        """
        from models import Dataset
        for v in [dataset_a, dataset_b]:
            if not isinstance(v, Dataset):
                raise TypeError(
                    'To compare the datasets, both must be of class Dataset.'
                )
        diff_ds = {}
        omit_this_keys = ['required_keys', 'context']
        for k, v in dataset_a.__dict__.items():
            if k not in omit_this_keys:
                if v != dataset_b.__dict__[k]:
                    diff_ds.update({
                        k:
                        dataset_b.__dict__[k]
                        if len(dataset_b.__dict__[k]) > 0 else v
                    })
                else:
                    diff_ds.update({k: v})
        return Dataset(datadict=diff_ds,
                       _distributions=dataset_a.__dict__['resources'],
                       _distribution_literal=True)
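A standalone sketch of the additive-diff rule described in the docstring above, using plain dicts in place of Dataset objects (the keys are hypothetical):

a = {'title': 'Old title', 'notes': 'Shared notes', 'url': ''}
b = {'title': 'New title', 'notes': 'Shared notes', 'url': ''}
diff = {}
for k, v in a.items():
    if v != b[k]:
        # dataset_b wins, unless its value is empty
        diff[k] = b[k] if len(b[k]) > 0 else v
    else:
        diff[k] = v
print(diff)  # {'title': 'New title', 'notes': 'Shared notes', 'url': ''}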
Example #36
def add_testdata_to_db(dataset, items, datatype):
    count = db.session.query(
        Dataset, Dataset.name).filter(Dataset.name == dataset).all()
    if len(count) > 0:
        return 'exists'

    new_dataset = Dataset(name=dataset, datatype=datatype)
    for item in items:
        testitem = Item(
            dataset_name=dataset,
            item=json.dumps(item),
            status='available',
            timestamp=datetime.now(),
        )
        new_dataset.items.append(testitem)
        db.session.add(testitem)
    db.session.add(new_dataset)
    db.session.commit()
    return 'added'
Example #37
def datasets():
	if not current_user.is_authenticated:
		return redirect(url_for('no_access'))

	datasets = Dataset.query.filter(Dataset.user_id == current_user.id).all()

	for ds in datasets:
		ds.distinctive_name = ds.distinctive_name or ds.filename
		if ds.distinctive_name == ds.filename:
			ds.display_filename = ''
		else: 
			ds.display_filename = ds.filename
			
	model = {
		'title': 'Datasets',
		'datasets': datasets
	}
	form = FileUploadForm()
	if form.validate_on_submit():

		dsFile = form.fileName.data

		separator = form.separator.data
		distinctive_name = form.distinctive_name.data

		filename = secure_filename(dsFile.filename)
		guid = str(uuid.uuid4())
		
		dsFile.seek(0)
		dt = dsFile.read()
		
		dbDs = Dataset(filename, guid, g.user, datetime.datetime.utcnow(), separator, distinctive_name, dt)
		
		db.session.add(dbDs)
		db.session.commit()
		return redirect(url_for('datasets'))
	
	model['form'] = form

	return render_template('datasets.html', model = model)
Example #38
    def post(self):
        urlfetch.set_default_fetch_deadline(60)
        self.response.headers['Content-Type'] = 'application/json'

        q = "select gbifdatasetid, icode, orgname, github_orgname, " \
            "source_url, github_reponame, url, gbifpublisherid " \
            "from resource_staging " \
            "where ipt=true and networks like '%VertNet%'"
        resources = carto_query(q)

        ds = []
        for resource in resources:
            ds.append(Dataset(id=resource['gbifdatasetid'], **resource))

        keys = ndb.put_multi(ds)

        result = {
            "datasets processed": len(keys),
            "message": "success"
        }

        self.response.write(json.dumps(result))
        return
Example #39
def dataset_upload(request):
    if request.method == 'POST':
        form = UploadFileForm(request.POST, request.FILES)
        if form.is_valid():
            post_dict = dict(request.POST)
            files_dict = dict(request.FILES)
            logging.debug(files_dict)
            logging.debug(post_dict)
            data = {"adducts": post_dict['adducts'],
                    "standards": post_dict['standards'],
                    "mass_accuracy_ppm": post_dict['mass_accuracy_ppm'][0],
                    "quad_window_mz": post_dict['quad_window_mz'][0],
                    "lc_info": post_dict['lc_info'][0],
                    "ms_info": post_dict['ms_info'][0]}
            uploaded_file = request.FILES['mzml_file']
            base_name, extension = os.path.splitext(uploaded_file.name)
            d = Dataset(name=uploaded_file.name, processing_finished=False)
            d.save()
            mzml_filename = "{}-{}{}".format(base_name, d.id, extension)
            mzml_filepath = os.path.join(settings.MEDIA_ROOT, mzml_filename)
            logging.debug("mzML filepath: " + mzml_filepath)
            logging.debug("original mzML filename: " + uploaded_file.name)
            with open(mzml_filepath, 'w') as destination:
                for chunk in uploaded_file.chunks():
                    destination.write(chunk)
            d.path = mzml_filepath
            d.save()
            tasks.handle_uploaded_files.delay(data, mzml_filepath, d)
            return redirect('dataset-list')
    else:
        form = UploadFileForm(initial={"mass_accuracy_ppm": 10.0, 'quad_window_mz': 1.0})
    autocomplete = {
        'lc_info': [str(info.content) for info in LcInfo.objects.all()],
        'ms_info': [str(info.content) for info in MsInfo.objects.all()],
    }
    return render(request, 'mcf_standards_browse/dataset_upload.html', {'form': form, 'autocomplete': autocomplete})
Example #40
def hazardous_waste(year=2011, verbose=True):
    try:
        dataset = Dataset.objects.get(name="Hazardous Waste Sites "+str(year))
        dataset.cached = datetime.utcnow().replace(tzinfo=utc)
    except ObjectDoesNotExist:
        coor = GeoCoordinates(lat_field="Latitude",
                              lon_field="Longitude")
        coor.save()
        names = DatasetNameField(field1_en="Generator Status",
                                 field1_name="Generator Status",
                                 field2_en="Biennial Report Link",
                                 field2_name="Biennial Report Link")
        names.save()
        location = Location(street_field="Address",
                            city_field="City",
                            state_field="State",
                            zipcode_field="ZIP Code",
                            county_field="County")
        dataset = Dataset(
            name="Hazardous Waste Sites "+str(year),
            url='/data/ej/'+str(year)+'/',
            cached=datetime.utcnow().replace(tzinfo=utc),
            cache_max_age=1000,
            remote_id_field="Handler ID",
            name_field="Handler Name",
            location=location,
            coordinates=coor,
            names=names,
            needs_geocoding=False)
    dataset.save()

    MapPoint.objects.filter(dataset=dataset).delete()

    for state in ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE',
                  'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA',
                  'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN',
                  'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM',
                  'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI',
                  'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA',
                  'WV', 'WI', 'WY']:
        short_name = 'Envirofacts_Biennial_Report_Search ' + state + '.CSV'
        path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/ej/'+str(year)+'/'+short_name))
        if not os.path.isfile(path):
            if verbose:
                print 'No file %s exists.' % (short_name)
            short_name = str(year)+' '+state+'.CSV'
            path = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data/ej/'+str(year)+'/'+short_name))
            if not os.path.isfile(path):
                if verbose:
                    print 'No file %s exists.' % (short_name)
                continue
        if verbose:
            print 'Opening file %s' % (short_name)
        readfile = csv.reader(open(path, 'rb'))
        # verify
        row = readfile.next()
        locs = {}
        for i in range(len(row)):
            if row[i] == dataset.remote_id_field:
                locs['remote_id'] = i
            elif row[i] == dataset.name_field:
                locs['name'] = i
            elif row[i] == dataset.location.street_field:
                locs['street'] = i
            elif row[i] == dataset.location.city_field:
                locs['city'] = i
            elif row[i] == dataset.location.state_field:
                locs['state'] = i
            elif row[i] == dataset.location.zipcode_field:
                locs['zipcode'] = i
            elif row[i] == dataset.location.county_field:
                locs['county'] = i
            elif row[i] == dataset.coordinates.lat_field:
                locs['lat'] = i
            elif row[i] == dataset.coordinates.lon_field:
                locs['lon'] = i
            elif row[i] == dataset.names.field1_name:
                locs['field1'] = i
            elif row[i] == dataset.names.field2_name:
                locs['field2'] = i
        for row in readfile:
            kwargs = {'dataset': dataset}
            for key in locs:
                if key in ['lat', 'lon']:
                    try:
                        kwargs[key] = float(row[locs[key]])
                    except Exception:
                        kwargs[key] = 0.
                elif MapPoint._meta.get_field(key).max_length < len(row[locs[key]]):
                    kwargs[key] = row[locs[key]][:MapPoint._meta.get_field(key).max_length]
                else:
                    kwargs[key] = row[locs[key]]
            try:
                kwargs['point'] = Point(kwargs['lon'], kwargs['lat'])
            except Exception:
                if verbose:
                    print '\tInvalid lat/long for row: %s' % (row)
                    print '\tLat: %f Lon: %f' % (kwargs['lat'], kwargs['lon'])
                continue
            mp = MapPoint(**kwargs)
            mp.save()
        if verbose:
            print 'File "%s" done processing' % (short_name)
Example #41
def run(verbose=True, year=2010, starting_state=1):
    yn = ''
    # https://docs.djangoproject.com/en/1.7/ref/contrib/gis/layermapping/
    while DEBUG and yn != 'y':
        yn = raw_input('This process can be memory-intensive if'
                       'DEBUG = True in settings as this logs all SQL. '
                       'DEBUG is currently True. Please set this to False'
                       'if you are experiencing issues. Continue (y/n)?') \
                       .lower().strip()
        if yn == 'n':
            return
    dataset_qs = Dataset.objects.filter(name__exact=str(year)+' Census Tracts')
    if len(dataset_qs) > 0:
        ds = dataset_qs[0]
        ds.cached = datetime.utcnow().replace(tzinfo=utc)
    else:
        coor = GeoCoordinates(lat_field='INTPTLAT'+str(year)[-2:],
                              lon_field='INTPTLON'+str(year)[-2:])
        coor.save()
        names = DatasetNameField(field1_en='Land Area',
                                 field1_name='ALAND'+str(year)[-2:],
                                 field2_en='Water Area',
                                 field2_name='AWATER'+str(year)[-2:])
        names.save()
        ds = Dataset(name=str(year)+' Census Tracts',
                     cached=datetime.utcnow().replace(tzinfo=utc),
                     cache_max_age=1000,
                     name_field='NAMELSAD'+str(year)[-2:],
                     coordinates=coor,
                     names=names)
        if year == 2010:
            ds.remote_id_field = 'GEOID10'
        elif year == 2000:
            ds.remote_id_field = 'CTIDFP00'
        ds.save()

    tract_mapping = {
        'remote_id': ds.remote_id_field,
        'name': ds.name_field,
        'lat': ds.coordinates.lat_field,
        'lon': ds.coordinates.lon_field,
        'field1': ds.names.field1_name,
        'field2': ds.names.field2_name,
        'mpoly': 'MULTIPOLYGON',
    }

    ftp = ftplib.FTP('ftp2.census.gov')
    ftp.login()
    ftp.cwd("/geo/tiger/TIGER2010/TRACT/" + str(year) + "/")
    files = ftp.nlst()

    MapPolygon.objects.filter(dataset_id__isnull=True).delete()
    max_state = MapPolygon.objects.filter(dataset_id__exact=ds.id).aggregate(Max('remote_id'))
    max_state = max_state['remote_id__max']
    if max_state is not None:
        try:
            max_state = int(max_state)/1000000000
            if max_state >= starting_state:
                starting_state = max_state + 1
        except Exception:
            pass

    for i in [format(x, '#02d') for x in range(starting_state, 100)]:
        short_name = 'tl_2010_' + i + '_tract' + str(year)[-2:]
        tract_shp = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                    'data/'+short_name))
        if (not os.path.isfile(tract_shp+'.shp')
            or not os.path.isfile(tract_shp+'.shx')
            or not os.path.isfile(tract_shp+'.shp.xml')
            or not os.path.isfile(tract_shp+'.prj')
            or not os.path.isfile(tract_shp+'.dbf')):

            if short_name + '.zip' not in files:
                continue
            if verbose:
                print short_name + '.shp does not exist locally.\n\tDownloading from Census FTP...'
            try:
                # download the file
                local_file = open(tract_shp+'.zip', 'wb')
                ftp.retrbinary('RETR '+short_name+'.zip', local_file.write)
                local_file.close()
                # open the zip
                zipped = zipfile.ZipFile(tract_shp+'.zip')
                for suffix in ['.shp', '.prj', '.dbf', '.shp.xml', '.shx']:
                    zipped.extract(short_name+suffix, os.path.abspath(os.path.join(os.path.dirname(__file__), 'data')))
            except Exception as inst:
                if verbose:
                    print '\tException:', inst
                    print '\t'+short_name + '.shp did not download or unzip correctly. Moving on...'
                continue
        tract_shp = tract_shp + '.shp'
        if verbose:
            print '\tBegin layer mapping...'
        lm = LayerMapping(MapPolygon, tract_shp, tract_mapping, transform=False, encoding='iso-8859-1')

        while True:
            try:
                lm.save(strict=True, verbose=False)  # verbose)
                break
            # exception part is untested, error didn't happen again
            except Exception as inst:
                yn = ''
                while yn not in ['n', 'y']:
                    yn = raw_input('Error saving: ' + str(inst) + '\nContinue (y/n)?').strip().lower()
                if yn == 'y':
                    MapPolygon.objects.filter(dataset_id__isnull=True).filter(remote_id__startswith=i).delete()
                else:
                    break
        if verbose:
            print '\tLayer mapping done.'
        MapPolygon.objects.filter(dataset=None).update(dataset=ds)
        if verbose:
            print '\tLayer associated with dataset.'
    ftp.quit()

    if verbose:
        print 'All shapefiles added.'
Example #42
class NDDataset:
  """Dataset interface for each cache Dataset"""

  def __init__(self, dataset_name):
    """Intialize the dataset"""
    
    self.dataset_name = dataset_name
    self.channel_list = []
    self.db = CacheDB()

    try:
      self.ds = Dataset.objects.get(dataset_name = dataset_name)
    except ObjectDoesNotExist as e:
      self.fetchDataset()

    self.populateDataset()
  

  def fetchDataset (self):
    """Fetch a dataset to the list of cacheable datasets"""

    token = self.dataset_name.split('-')[0]
    
    try:
      json_info = json.loads(getURL('http://{}/ocpca/{}/info/'.format(settings.SERVER, token)))
    except Exception as e:
      logger.error("Token {} doesn not exist on the backend {}".format(token, settings.SERVER))
      raise NDTILECACHEError("Token {} doesn not exist on the backend {}".format(token, settings.SERVER))
    
    ximagesize, yimagesize, zimagesize = json_info['dataset']['imagesize']['0']
    xoffset, yoffset, zoffset = json_info['dataset']['offset']['0']
    xvoxelres, yvoxelres, zvoxelres = json_info['dataset']['voxelres']['0']
    scalinglevels = json_info['dataset']['scalinglevels']
    scalingoption = ND_scalingtoint[json_info['dataset']['scaling']]
    starttime, endtime = json_info['dataset']['timerange']
    project_name = json_info['project']['name']
    s3backend = json_info['project']['s3backend']
    
    self.ds = Dataset(dataset_name=self.dataset_name, ximagesize=ximagesize,
                      yimagesize=yimagesize, zimagesize=zimagesize, xoffset=xoffset,
                      yoffset=yoffset, zoffset=zoffset, xvoxelres=xvoxelres,
                      yvoxelres=yvoxelres, zvoxelres=zvoxelres, scalingoption=scalingoption,
                      scalinglevels=scalinglevels, starttime=starttime, endtime=endtime,
                      project_name=project_name, s3backend=s3backend)
    self.ds.save()

    for channel_name in json_info['channels'].keys():
      channel_name = channel_name
      dataset_id = self.dataset_name
      channel_type = json_info['channels'][channel_name]['channel_type']
      channel_datatype = json_info['channels'][channel_name]['datatype']
      startwindow, endwindow = json_info['channels'][channel_name]['windowrange']
      propagate = json_info['channels'][channel_name]['propagate'] 
      readonly = json_info['channels'][channel_name]['readonly']
      ch = Channel(channel_name=channel_name, dataset=self.ds, channel_type=channel_type, channel_datatype=channel_datatype, startwindow=startwindow, endwindow=endwindow, propagate=propagate, readonly=readonly)
      ch.save()


  def populateDataset (self):
    """Populate a dataset information using the information stored"""

    self.resolutions = []
    self.cubedim = {}
    self.supercubedim = {}
    self.imagesz = {}
    self.offset = {}
    self.voxelres = {}
    self.scale = {}
    self.timerange = [self.ds.starttime, self.ds.endtime]

    for i in range(self.ds.scalinglevels+1):

      # add this level to the resolutions
      self.resolutions.append( i )

      # set the image size. the scaled down image rounded up to the nearest cube
      xpixels = ((self.ds.ximagesize-1)/2**i)+1
      ypixels = ((self.ds.yimagesize-1)/2**i)+1
      if self.ds.scalingoption == ZSLICES:
        zpixels = self.ds.zimagesize
      else:
        zpixels = ((self.ds.zimagesize-1)/2**i)+1
      
      self.imagesz[i] = [xpixels, ypixels, zpixels]

      # set the offset
      xoffseti = 0 if self.ds.xoffset == 0 else ((self.ds.xoffset)/2**i)
      yoffseti = 0 if self.ds.yoffset == 0 else ((self.ds.yoffset)/2**i)
      if self.ds.zoffset == 0:
        zoffseti = 0
      else:
        if self.ds.scalingoption == ZSLICES:
          zoffseti = self.ds.zoffset
        else:
          zoffseti = ((self.ds.zoffset)/2**i)
      
      self.offset[i] = [ xoffseti, yoffseti, zoffseti ]

      # set the voxelresolution
      xvoxelresi = self.ds.xvoxelres * float(2**i)
      yvoxelresi = self.ds.yvoxelres * float(2**i)
      zvoxelresi = self.ds.zvoxelres if self.ds.scalingoption == ZSLICES else self.ds.zvoxelres*float(2**i)
            
      self.voxelres[i] = [ xvoxelresi, yvoxelresi, zvoxelresi ]
      self.scale[i] = { 'xy':xvoxelresi/yvoxelresi , 'yz':zvoxelresi/xvoxelresi, 'xz':zvoxelresi/yvoxelresi }
       
      # choose the cubedim as a function of the zscale
      #  this may need to be changed.  
      if self.ds.scalingoption == ZSLICES:
        if float(self.ds.zvoxelres/self.ds.xvoxelres)/(2**i) >  0.5:
          self.cubedim[i] = [128, 128, 16]
        else:
          self.cubedim[i] = [64, 64, 64]
      
      if self.ds.s3backend == S3_TRUE:
        self.supercubedim[i] = map(mul, self.cubedim[i], SUPERCUBESIZE)
      else:
        self.supercubedim[i] = self.cubedim[i]

      # Make an exception for bock11 data -- just an inconsistency in original ingest
      if self.ds.ximagesize == 135424 and i == 5:
        self.cubedim[i] = [128, 128, 16]
      else:
        # RB what should we use as a cubedim?
        self.cubedim[i] = [128, 128, 16]

  def removeDataset(self):
    """Remove a dataset"""
    
    self.ds.delete()
    
    try:
      shutil.rmtree("{}/{}".format(settings.CACHE_DIR, self.dataset_name))
    except Exception as e:
      logger.error("Failed to remove dataset directories at {}. Error {}. Manual cleanup may be necessary.".format(self.dataset_name, e))
      raise NDTILECACHEError("Failed to remove dataset directories at {}. Error {}. Manual cleanup may be necessary.".format(self.dataset_name, e))
  
  # Accessors
  def getDatasetName(self):
    return self.ds.dataset_name
  
  def getDatasetId(self):
    return self.ds.dataset_id

  def getS3Backend(self):
    return self.ds.s3backend

  def getImageSize(self, resolution):
    return self.imagesz[resolution]

  def getVoxelRes(self, resolution):
    return self.voxelres[resolution]

  def getProjectName(self):
    return self.ds.project_name
  
  def getChannelObj(self, channel_name):
    """Return a channel object"""
    
    return NDChannel(channel_name, self.ds)
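# --- Illustrative sketch (not part of the original class) ---
# populateDataset() above derives the image size per scaling level: x and y are
# halved at every level (rounded so the scaled image still covers the volume),
# while z is only halved when the scaling option is not ZSLICES.  A minimal
# standalone version of that arithmetic (using // where the Python 2 code above
# relies on integer division with /):
def scaled_image_size(ximagesize, yimagesize, zimagesize, level, zslices=True):
    xpixels = ((ximagesize - 1) // 2**level) + 1
    ypixels = ((yimagesize - 1) // 2**level) + 1
    zpixels = zimagesize if zslices else ((zimagesize - 1) // 2**level) + 1
    return [xpixels, ypixels, zpixels]

# scaled_image_size(1001, 1001, 100, level=3) -> [126, 126, 100]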
def main(show_progress, *args, **kwargs):

    # Create a new fetch index for the records fetched.
    last_fetch_index = Dataset.select(fn.Max(Dataset.fetch_index)).scalar() or 0
    fetch_index = last_fetch_index + 1

    # Set up progress bar
    if show_progress:
        progress_bar = ProgressBar(widgets=[
            'Progress: ', Percentage(),
            ' ', Bar(marker=RotatingMarker()),
            ' ', ETA(),
            ' Fetched metadata for ', Counter(), ' datasets.'
        ])
        progress_bar.start()

    # Fetch all pages of datasets
    datasets_fetched = 0
    last_page = False
    while not last_page:

        params = DEFAULT_PARAMS.copy()
        params['start'] = datasets_fetched
        resp = make_request(default_requests_session.get, URL, params=params).json()

        if not resp['success']:
            logging.error("Request to URL %s was unsuccessful", URL)
            break  # no usable 'result' payload to parse; stop paging

        result = resp['result']
        num_datasets = len(result['results'])
        datasets_fetched += num_datasets

        if show_progress:
            # We can finally initialize the total number of datasets expected
            # only after we get the first round of results.
            progress_bar.maxval = result['count']
            progress_bar.update(datasets_fetched)

        for dataset in result['results']:

            dataset_record = Dataset.create(
                dataset_id=dataset['id'],
                title=trim_char_data(dataset['title']),
                license_title=trim_char_data(dataset['license_title']),
                fetch_index=fetch_index,
            )

            for resource in dataset['resources']:
                if resource['format'] == DATA_FORMAT:
                    Resource.create(
                        resource_id=resource['id'],
                        dataset=dataset_record,
                        format=resource['format'],
                        url=resource['url'],
                    )

        time.sleep(REQUEST_DELAY)  # enforce a pause between each fetch to be respectful to API
        last_page = datasets_fetched >= result['count']

    if show_progress:
        progress_bar.finish()
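# --- Illustrative sketch (not part of the original script) ---
# The loop above pages through the API by passing 'start' = number of datasets
# already fetched and stops once that number reaches result['count'].  A tiny
# self-contained simulation of the same bookkeeping (fake_page is hypothetical,
# standing in for the real request):
def fake_page(start, page_size=10, total=25):
    end = min(start + page_size, total)
    return {'count': total, 'results': list(range(start, end))}

fetched = 0
last_page = False
while not last_page:
    page = fake_page(fetched)
    fetched += len(page['results'])
    last_page = fetched >= page['count']
assert fetched == 25  # three pages: 10 + 10 + 5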
Example #44
0
    file_dataset = "df_σ02_350_08Х18Н10Т.json"
    target_mech = "σ0,2_350"
    norm_mech = "σ0,2_350_norm"
    target = "is_defect"

    with open(file_dataset, 'r') as f:
        df = pd.DataFrame(json.loads(f.read()))

    print("Dataset: read is done!")

    output = defaultdict(list)

    for thr in tqdm.tqdm([1.01, 1.02, 1.03, 1.04, 1.05, 1.06, 1.07, 1.08, 1.09, 1.1], desc="Thr"):
        df_train = df.assign(is_defect=lambda row: (row[target_mech] - thr * row[norm_mech] < 0).astype(int)).drop([target_mech, norm_mech], axis=1)
        share = df_train[target].mean()
        d = Dataset(data=json.dumps(df_train.select_dtypes(np.number).to_dict('records')),
                    features=df_train.select_dtypes(np.number).drop(target, axis=1).columns, target=target)
        m = MlModel(model_type='RandomForestClassifier')
        search_space = OptParams(model_type=type(m.get_model()).__name__)
        opt = Opt(data=d,
                  params=search_space,
                  pipeline=m,
                  metric=partial(precision_score, zero_division=0),
                  trials=Trials()
                  )

        opt.start_opt()

        output['thr'] += [thr]
        output['share'] += [share]
        output['best_trial'] += [opt.trials.best_trial['result']]
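# --- Illustrative sketch (not part of the original snippet) ---
# The labelling rule above flags a row as defective when the measured property
# falls below thr times its norm, i.e. target_mech - thr * norm_mech < 0.
# A toy example with hypothetical column names, assuming pandas:
import pandas as pd

toy = pd.DataFrame({'sigma': [300.0, 360.0, 340.0], 'sigma_norm': [330.0, 330.0, 330.0]})
thr = 1.05  # 1.05 * 330 = 346.5
toy = toy.assign(is_defect=lambda df: (df['sigma'] - thr * df['sigma_norm'] < 0).astype(int))
# toy['is_defect'].tolist() -> [1, 0, 1]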
Example #45
0
    def update_model(self):
        """
        Update XGboost model (gbm), using relevant data.
        This function using model Films and Users, please don't change their.
        :return: accuracy of the new model
        """

        # load the list of users
        count = Users.select().count()
        users = []
        for i in range(0, count, 100):
            usrs = Users.select().offset(i).limit(100).execute()
            for u in usrs:
                users.append(model_to_dict(u))

        # collect dataset
        dataset = []
        for i in range(0, count, 200):
            data = Dataset.select().order_by(
                Dataset.id).offset(i).limit(200).execute()
            for d in data:
                dataset.append(model_to_dict(d))
        dataset = self.filtr(dataset)
        dataset = [{
            "data": self.full_data(d["film"], d["user"]),
            "result": d["result"]
        } for d in dataset]

        X = [d["data"] for d in dataset]
        Y = [int(d["result"] > 0) for d in dataset]

        from sklearn.preprocessing import normalize
        X = normalize(X)

        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            Y,
                                                            test_size=0.1,
                                                            random_state=42)

        # learning new model
        if MODEL == "gbm":
            model = xgb.XGBClassifier(
                max_depth=7,
                n_estimators=1600,
                learning_rate=0.01,
                subsample=0.3,
                #                         gamma = 300,
                colsample_bytree=0.3).fit(X_train, y_train)

        else:
            pool = Pool(X_train, y_train)
            model = CatBoostClassifier(iterations=1600,
                                       learning_rate=0.01,
                                       depth=5,
                                       random_seed=7)
            model.fit(pool)

        @not_test
        def save():  # save model
            import pickle
            pickle.dump(model, open(PATH_TO_DIR + "model", "wb"))

        # compute ROC AUC on the held-out test split
        predictions = model.predict_proba(X_test)[:, 1]
        from sklearn.metrics import roc_auc_score
        test = roc_auc_score(y_test, predictions)

        return test
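# --- Illustrative note (not part of the original method) ---
# sklearn's roc_auc_score expects (y_true, y_score) with continuous scores;
# thresholding the probabilities first collapses the ROC curve to a single
# point.  A minimal self-contained check (values from the sklearn docs):
from sklearn.metrics import roc_auc_score

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]
print(roc_auc_score(y_true, y_score))  # 0.75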
Example #46
0
 def test_make_FragmentationSpectrum(self):
     d1 = Dataset(name='Dataset1')
     d1.save()
     FragmentationSpectrum(precursor_mz='123.456',
                           spec_num=0, dataset=d1).save()
     self.assertEqual(FragmentationSpectrum.objects.all().count(), 1)
Example #47
0
def main(infile):

    # LOAD TRAIN SET
    dataset_train = Dataset.DatasetReview()
    dataset_train.load_review_from_csv(train_set)

    # LOAD TEST SET
    dataset_test = Dataset.DatasetReview()
    dataset_test.load_review_from_csv(test_set)

    # preprocessor = DatasetPreprocessor()
    # dataset = preprocessor.fold_cases_d(dataset)
    # dataset = preprocessor.remove_punctuations_d(dataset)
    # dataset = preprocessor.convert_numbers_d(dataset)

    # dataset.export_only_contents("../Test/dataset.txt")

    # fe = BagFeatureExtractor(dataset.get_contents())
    # fe.build()
    # fe.save_vocab("../Test/vocab.txt")

    # dataset.export_formatted_dataset("formatted_dataset_wow.tsv")

    print "\n**** CROSS VALIDATION EVALUATION (CORPUS: WIKIPEDIA) ****\n"

    fe = BagFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_cross_validation(classifier, fe, dataset_train)

    fe = TfidfFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_cross_validation(classifier, fe, dataset_train)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(),
                                       infile=wiki_w2v_model,
                                       binary=False,
                                       dimen=200)
    # fe.save_model_to_file("vectors_full.txt", vocabfile="vocab_full.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_cross_validation(classifier, fe, dataset_train)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(),
                                       infile="vectors_full_wow.txt",
                                       binary=False,
                                       dimen=200)
    # fe.save_model_to_file("vectors_full.txt", vocabfile="vocab_full.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_cross_validation(classifier, fe, dataset_train)

    fe = SennaFeatureExtractor(dataset_train.get_contents(),
                               infile="../senna_vectors.txt",
                               vocabfile="../senna_vocab.txt")
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_cross_validation(classifier, fe, dataset_train)

    print "\n**** TRAINING SET EVALUATION (CORPUS: WIKIPEDIA) ****\n"

    fe = BagFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_training_set(classifier, fe, dataset_train)

    fe = TfidfFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_training_set(classifier, fe, dataset_train)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(),
                                       infile=wiki_w2v_model,
                                       binary=False,
                                       dimen=200)
    # fe.save_model_to_file("vectors_full_wow.txt", vocabfile="vocab_full_wow.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_training_set(classifier, fe, dataset_train)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(),
                                       infile="vectors_full_wow.txt",
                                       binary=False,
                                       dimen=200)
    # fe.save_model_to_file("vectors_full.txt", vocabfile="vocab_full.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_training_set(classifier, fe, dataset_train)

    fe = SennaFeatureExtractor(dataset_train.get_contents(),
                               infile="../senna_vectors.txt",
                               vocabfile="../senna_vocab.txt")
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_training_set(classifier, fe, dataset_train)

    print "TEST SET EVALUATION (CORPUS: WIKIPEDIA)"

    fe = BagFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)

    fe = TfidfFeatureExtractor(dataset_train.get_contents())
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(),
                                       infile=wiki_w2v_model,
                                       binary=False,
                                       dimen=200)
    # fe.save_model_to_file("vectors_full_wow.txt", vocabfile="vocab_full_wow.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)

    fe = WordEmbeddingFeatureExtractor(dataset_train.get_contents(),
                                       infile="vectors_full_wow.txt",
                                       binary=False,
                                       dimen=200)
    # fe.save_model_to_file("vectors_full.txt", vocabfile="vocab_full.txt", binary=False)
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)

    fe = SennaFeatureExtractor(dataset_train.get_contents(),
                               infile="../senna_vectors.txt",
                               vocabfile="../senna_vocab.txt")
    classifier = Classifier(models="svm")
    ev = Evaluator()
    ev.eval_with_test_set(classifier, fe, dataset_train, dataset_test)
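# --- Illustrative sketch (not part of the original script) ---
# Each evaluation block above repeats the same three lines for a different
# feature extractor.  Assuming the Classifier/Evaluator API used above (defined
# elsewhere in this project), the cross-validation pass could be condensed into
# a helper such as this hypothetical one:
def _cross_validate_all(feature_extractors, training_set):
    for fe in feature_extractors:
        classifier = Classifier(models="svm")
        ev = Evaluator()
        ev.eval_with_cross_validation(classifier, fe, training_set)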
def main():
    """ Sentiment Specific Embedding for twitter classification """

    embeddings_size = 50  # Embedding size for SSWE model
    vocab_file = "Embedding/features/semeval_vocabs_200.txt"  # path to the vocabulary file
    vector_file = "Embedding/features/semeval_vectors_200.txt"  # path to the vector file
    stopwordsfile = "preprocess/stopwords.txt"
    """     Sentiment-Specific Word Embedding (SSWE)    """

    if True:
        # Load dataset
        data_train = 'dataset/training1600000.csv'  # training data set file path
        pre_data_train = 'dataset/preprocessed_dataset1600000.csv'  # file to save dataset after cleaning

        if True:
            print("\n **** Dataset cleaning ****")
            tweets_prepocess(data_train, pre_data_train, stopwordsfile)

        if True:
            print("\n **** SSWE model Trainig ****")
            train_model = None  # path to the file contains the trained model if it is already exist
            save_model = "Embedding/models/SSWE_model_1600000_200"  # path to the file where model will be saved
            sswe = create_sswe_model(pre_data_train, vocab_file, vector_file,
                                     train_model, save_model, embeddings_size)
            sswe_trainer(sswe)
    """     Embedding visualisation and Similarity computing    """

    if True:
        visualiser = Visualiser(
            sizeOfEmbedding=embeddings_size,
            VocabsFname=vocab_file,
            VectorsFname=vector_file,
            WVFilename="Visualisation/data/w2vformat.txt",
            visualizerHTMLfilename="Visualisation/data/embedding.html")
        visualiser.visualize()
    """ Twitter Sentiment Classification """

    if True:
        # Data pre-processing

        print("\n **** Training data cleaning ****")
        pre_processing_train = "dataset/preprocessed_semeval_traindataset.csv"
        # tweets_prepocess(train_set, pre_processing_train, stopwordsfile)

        print("\n **** Test data cleaning ****")
        pre_processing_test = "dataset/preprocessed_semeval_testdataset.csv"
        # tweets_prepocess(test_set, pre_processing_test, stopwordsfile)

        # LOAD TRAIN SET
        dataset_train = Dataset.DatasetReview()
        dataset_train.load_review_from_csv(pre_processing_train)

        # LOAD TEST SET
        dataset_test = Dataset.DatasetReview()
        dataset_test.load_review_from_csv(pre_processing_test)

        ################################### Neural Nets classifier ###########################

        # Extract Features
        tweet2v = get_sswe_features(vocab_file, vector_file)

        # Extract samples and labels
        x_train, y_train = split_data(dataset_train)
        x_test, y_test = split_data(dataset_test)

        tfidf = build_tfidf(x_train)

        train_vecs_sswe = np.concatenate([
            buildWordVector(z.split(), embeddings_size, tweet2v, tfidf)
            for z in tqdm(x_train)
        ])

        train_vecs_sswe = scale(train_vecs_sswe)

        test_vecs_sswe = np.concatenate([
            buildWordVector(z.split(), embeddings_size, tweet2v, tfidf)
            for z in tqdm(x_test)
        ])
        test_vecs_sswe = scale(test_vecs_sswe)

        # neural network model
        neuralnets = NeuralNets(input_size=embeddings_size,
                                x_train=train_vecs_sswe,
                                y_train=y_train,
                                epochs=450,
                                batch_size=32,
                                x_test=test_vecs_sswe,
                                y_test=y_test)
        neuralnets.train_neural_nets()

        ##########################################################################################
        ########
        ########        Classical classifiers with sklearn
        ########
        ##########################################################################################
        print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n")

        fe_sswe = SennaFeatureExtractor(infile=vector_file,
                                        vocabfile=vocab_file,
                                        dimen=embeddings_size)
        feature_extractors = [fe_sswe]
        ev = Evaluator()

        ################################# SVM ###################################################

        print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n")
        model = Classifier(models="svm")
        kfold = KFold(n_splits=10)
        ev.eval_with_cross_validation(model,
                                      feature_extractors=feature_extractors,
                                      training_set=dataset_train,
                                      num_fold=10,
                                      cv=kfold)
        ev.create_evaluation_result(model,
                                    feature_extractors=feature_extractors,
                                    training_set=dataset_train,
                                    num_fold=10,
                                    cv=kfold)

        print("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n")
        ev.eval_with_test_set(model,
                              feature_extractors=feature_extractors,
                              training_set=dataset_train,
                              test_set=dataset_test)

        ################################### Naive bayes ##########################################

        print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n")
        model = Classifier(models="multinomial")
        kfold = KFold(n_splits=10)
        ev.eval_with_cross_validation(model,
                                      feature_extractors=feature_extractors,
                                      training_set=dataset_train,
                                      num_fold=10,
                                      cv=kfold)
        ev.create_evaluation_result(model,
                                    feature_extractors=feature_extractors,
                                    training_set=dataset_train,
                                    num_fold=10,
                                    cv=kfold)

        print("\n**** TEST SET EVALUATION (CORPUS: DATASET) ****\n")
        ev.eval_with_test_set(model,
                              feature_extractors=feature_extractors,
                              training_set=dataset_train,
                              test_set=dataset_test)

        #########################################  RandomForestClassifier #######################

        print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n")
        model = Classifier(models="rfc")
        kfold = KFold(n_splits=10)
        ev.eval_with_cross_validation(model,
                                      feature_extractors=feature_extractors,
                                      training_set=dataset_train,
                                      num_fold=10,
                                      cv=kfold)
        ev.create_evaluation_result(model,
                                    feature_extractors=feature_extractors,
                                    training_set=dataset_train,
                                    num_fold=10,
                                    cv=kfold)

        print("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n")
        ev.eval_with_test_set(model,
                              feature_extractors=feature_extractors,
                              training_set=dataset_train,
                              test_set=dataset_test)

        #########################################  MLPClassifier #######################

        print("\n**** CROSS VALIDATION EVALUATION (CORPUS: SemEval) ****\n")
        model = Classifier(models="nn")
        kfold = KFold(n_splits=10)
        ev.eval_with_cross_validation(model,
                                      feature_extractors=feature_extractors,
                                      training_set=dataset_train,
                                      num_fold=10,
                                      cv=kfold)
        ev.create_evaluation_result(model,
                                    feature_extractors=feature_extractors,
                                    training_set=dataset_train,
                                    num_fold=10,
                                    cv=kfold)

        print("\n**** TEST SET EVALUATION (CORPUS: SemEval) ****\n")
        ev.eval_with_test_set(model,
                              feature_extractors=feature_extractors,
                              training_set=dataset_train,
                              test_set=dataset_test)
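# --- Illustrative sketch (not part of the original script) ---
# The four classifier sections above (svm / multinomial / rfc / nn) differ only
# in the model name handed to Classifier.  Assuming the Evaluator API used
# above, the same evaluations could be driven by a single loop (the helper name
# is hypothetical):
def _evaluate_models(feature_extractors, dataset_train, dataset_test):
    from sklearn.model_selection import KFold
    ev = Evaluator()
    for name in ("svm", "multinomial", "rfc", "nn"):
        model = Classifier(models=name)
        kfold = KFold(n_splits=10)
        ev.eval_with_cross_validation(model, feature_extractors=feature_extractors,
                                      training_set=dataset_train, num_fold=10, cv=kfold)
        ev.create_evaluation_result(model, feature_extractors=feature_extractors,
                                    training_set=dataset_train, num_fold=10, cv=kfold)
        ev.eval_with_test_set(model, feature_extractors=feature_extractors,
                              training_set=dataset_train, test_set=dataset_test)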