Example #1
    def publish(self):
        """Publish a run to the OpenML server.

        Uploads the results of a run to OpenML.
        Sets the run_id on self

        Returns
        -------
        self : OpenMLRun
        """
        if self.model is None:
            raise PyOpenMLError("OpenMLRun obj does not contain a model. (This should never happen.) ")
        if self.flow_id is None:
            raise PyOpenMLError("OpenMLRun obj does not contain a flow id. (Should have been added while executing the task.) ")

        description_xml = self._create_description_xml()
        file_elements = {'description': ("description.xml", description_xml)}

        if self.error_message is None:
            predictions = arff.dumps(self._generate_arff_dict())
            file_elements['predictions'] = ("predictions.arff", predictions)

        if self.trace_content is not None:
            trace_arff = arff.dumps(self._generate_trace_arff_dict())
            file_elements['trace'] = ("trace.arff", trace_arff)

        return_value = _perform_api_call("/run/", file_elements=file_elements)
        run_id = int(xmltodict.parse(return_value)['oml:upload_run']['oml:run_id'])
        self.run_id = run_id
        return self
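Most of the snippets on this page build a plain Python dictionary and pass it to liac-arff's arff.dumps, which returns the complete ARFF document as a string. A minimal sketch of the dictionary layout that dumps expects; the relation and attribute names below are made up for illustration:

import arff  # liac-arff

# 'attributes' is a list of (name, type) pairs; the type is 'NUMERIC', 'REAL',
# 'INTEGER', 'STRING', or a list of nominal values.
obj = {
    'description': 'toy example',
    'relation': 'weather',
    'attributes': [
        ('temperature', 'REAL'),
        ('outlook', ['sunny', 'rainy']),
    ],
    'data': [
        [21.5, 'sunny'],
        [12.0, 'rainy'],
    ],
}
print(arff.dumps(obj))  # prints the serialized ARFF text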
Example #3
    def publish(self) -> 'OpenMLRun':
        """ Publish a run (and if necessary, its flow) to the OpenML server.

        Uploads the results of a run to OpenML.
        If the run is of an unpublished OpenMLFlow, the flow will be uploaded too.
        Sets the run_id on self.

        Returns
        -------
        self : OpenMLRun
        """
        if self.model is None:
            raise PyOpenMLError("OpenMLRun obj does not contain a model. "
                                "(This should never happen.) ")
        if self.flow_id is None:
            if self.flow is None:
                raise PyOpenMLError(
                    "OpenMLRun object does not contain a flow id or reference to OpenMLFlow "
                    "(these should have been added while executing the task). "
                )
            else:
                # publish the linked Flow before publishing the run.
                self.flow.publish()
                self.flow_id = self.flow.flow_id

        if self.parameter_settings is None:
            if self.flow is None:
                self.flow = openml.flows.get_flow(self.flow_id)
            self.parameter_settings = self.flow.extension.obtain_parameter_values(
                self.flow,
                self.model,
            )

        description_xml = self._create_description_xml()
        file_elements = {'description': ("description.xml", description_xml)}

        if self.error_message is None:
            predictions = arff.dumps(self._generate_arff_dict())
            file_elements['predictions'] = ("predictions.arff", predictions)

        if self.trace is not None:
            trace_arff = arff.dumps(self.trace.trace_to_arff())
            file_elements['trace'] = ("trace.arff", trace_arff)

        if self.additional_information is not None:
            for (name, (file,
                        contents)) in self.additional_information.items():
                file_elements[name] = (file, contents)

        return_value = openml._api_calls._perform_api_call(
            "/run/", 'post', file_elements=file_elements)
        result = xmltodict.parse(return_value)
        self.run_id = int(result['oml:upload_run']['oml:run_id'])
        return self
Example #4
 def toArffString(self):
     data = dict()
     data.update({"description": self.getDescription()})
     data.update({"relation": self.relationName()})
     attributes = []
     for i in range(self.numAttributes()):
         t = []
         t.append(self.attribute(i).name())
         if self.attribute(i).type() == Attribute.NUMERIC:
             t.append("REAL")
         else:
             t.append(self.attribute(i).m_AttributeInfo.m_Values)
         attributes.append(tuple(t))
     data.update({"attributes": attributes})
     datas = []
     for i in range(self.numInstances()):
         val = []
         for j in range(self.numAttributes()):
             if self.instance(i).isMissing(j):
                 val.append(None)
             elif self.attribute(j).isNominal():
                 val.append(
                     self.attribute(j).value(self.instance(i).value(j)))
             else:
                 val.append(str(self.instance(i).value(j)))
         datas.append(val)
     data.update({"data": datas})
     text = arff.dumps(data)
     return text
Example #5
def store_as_arff(data,
                  labels,
                  relation,
                  path,
                  description=u'',
                  attributes=ATTRIBUTES):
    """
    Writes feature data to an ARFF file.
    :param data: The data set. (Without class labels)
    :param labels: Class labels in the same order as the data instances in ``data``
    :param relation: The name of the relation
    :param path: The full path to the file to write to.
    :param description: A description of the relation.
    :param attributes: The attributes in the data set and their levels
    """
    attributes.append(('@@TRUTH@@', ['Y', 'N']))

    new_data = []
    for i in range(len(data)):
        l = list(data[i])
        l += labels[i]
        new_data.append(l)

    arff_data = {
        'description': description,
        'relation': relation,
        'attributes': attributes,
        'data': new_data,
    }
    fh = open(path, 'w')
    fh.write(liac_arff.dumps(arff_data))
    fh.close()
Example #6
def partg():
	data = arff.load(open(filename))

	# find the attribute index (for-else: the else branch runs only if the loop never breaks)
	for attribute_index in range(len(data['attributes'])):
		if data['attributes'][attribute_index][0] == attribute:
			break
	else:
		print("The given attribute name does not exist.")
		exit(1)

	attribute_type = data['attributes'][attribute_index][1]

	# Add attribute 10 times
	for i in range(10):
		# Add a new attribute
		data['attributes'].insert(0, (attribute + str(i), attribute_type))

		# Append a copy to each piece of data
		for j in range(len(data['data'])):
			data['data'][j].insert(0, data['data'][j][attribute_index])

	# Write a new file
	p4g_file = open('partg-' + filename, 'w')
	p4g_file.write(arff.dumps(data))
Example #7
def vectors_to_arff_format():
    # To be used for Weka analysis
    data['data'] = read_feat()
    print(arff.dumps(data))


# compute_features_vector()
Example #8
 def export(self, paintings):
     """Export the analysed data."""
     data = {'description': self.__class__.__name__, 
             'relation': 'year', 
             'attributes': self.get_attributes(), 
             'data': self.get_values(paintings)}
     return arff.dumps(data)
Example #10
def findMissingValues(filename):
    with open(filename, 'r+') as af:
        arffFile = arff.load(af)
        data = arffFile['data']
        attributes = arffFile['attributes']
        numExamples = len(data)
        averages = []
        # loop over each attribute
        for index in range(len(attributes)):
            attr = attributes[index]
            average = '?'
            if isinstance(attr[1], list): # find mode if attr is classifier
                words_to_count = (row[index] for row in data if row[index] != None)
                c = Counter(words_to_count)
                average = c.most_common(1)[0][0] # stacks on stacks
            else: # find mean
                average = sum([row[index] for row in data if row[index] != None]) / numExamples
            averages.append(average)
        # update the missing values
        for row in data:
            for index in range(len(row)):
                if row[index] == None:
                    row[index] = averages[index]
        # overwrite the file
        af.seek(0)
        af.write(arff.dumps(arffFile))
        af.truncate()
        return data
Example #11
def cccCentring(ra, combnk, files, aRa, rGoldIndiv):
    for i in range(len(v.eName)):
        for f, fname in enumerate(files[i][0]):
            meanByF = []
            wghRater = []
            csv = rGoldIndiv[v.eName[i]][f]
            #Firstly we compute the mean of all raters for each file
            for a in range(v.nAn):
                #We get the mean
                meanRatersF = np.nanmean(csv[:, a + 1])
                meanByF.append(meanRatersF)
                #We take the weight of the rater in this file
                wghRater.append(aRa[a][i][f])
            #Now we calculate the weighted mean of all raters
            pondMean = np.sum(np.multiply(meanByF, wghRater)) / np.sum(
                aRa[:, i, f])
            #We have the mean of all raters, we need the total mean of the file
            meanF = np.nanmean(csv[:, 1:])
            #Now we will center each prediction according to the mean
            output = []
            #We prepare the ARFF file, we get the template
            data = arff.load(open(v.arffTempPath, 'rb'))
            for line in range(len(csv) - 1):
                meanLine = np.nanmean(csv[line + 1, 1:])
                newGs = meanLine - meanF + pondMean
                #We replace the values in the ARFF template
                data["data"][line][0] = fname.replace(".csv", "")
                data["data"][line][1] = round(csv[line + 1, 0], 2)
                data["data"][line][2] = round(newGs, 6)
            #We write the csv in the Gold Standard folder
            f = open(v.agsc[i] + fname.replace(".csv", ".arff"), "w")
            f.write(arff.dumps(data))
    return None
Example #12
def splitFile(filename):
    numClasses = 0
    classData = dict()
    with open(filename, 'rb') as af:
        arffFile = arff.load(af)
        attributes = arffFile['attributes']
        classes = attributes[-1][1]
        # replaces empty list for Firstyrcumgpa
        template['attributes'][-1] = (template['attributes'][-1][0], classes)
        numClasses = len(classes)
        for c in classes: 
            classData.setdefault(c, [])
        arffData = arffFile['data']
        for row in arffData:
            if row[-1] != None:
                cList = classData[row[-1]]
                cList.append(row)
        # save each key of classData to a separate arff
        filenum = 0
        for key, data in classData.items():
            template['data'] = data
            with open(temp_dir + '\o%g.arff' % filenum, 'w') as arffFile:
                arffFile.write(arff.dumps(template))
            filenum += 1
    return numClasses
Example #13
def make_arff(symbol, preparer, attributes):
    symbol_quotes = list(quotes.find({'Symbol': symbol}))

    data = [preparer(instance, i, symbol_quotes)
            for i, instance in enumerate(symbol_quotes[:-1])]

    return arff.dumps(data, relation=RELATION_NAME % symbol,
                      names=attributes)
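Note that this snippet (and Example #35 further down) targets the older row-oriented arff package rather than liac-arff: its dumps/dump functions take an iterable of data rows plus relation and names arguments instead of a single dictionary. A minimal sketch of that calling convention, reusing the values from Example #35 and mirroring its dumps/loads round trip:

import arff  # the row-oriented 'arff' package, not liac-arff

rows = [
    ['blonde', 17.2, 1],
    ['blue', 27.2, 2],
    ['blue', 18.2, 3],
]
# Column names and the relation name are passed separately from the rows.
arff_rows = arff.dumps(rows, relation='diabetics_data',
                       names=('hair_color', 'age', 'patno'))
reparsed = list(arff.loads(arff_rows))  # round-trip, as in Example #35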
Example #14
def do_run(task, optimizer, output_dir, internet_access=True, publish=False):
    if internet_access:
        run = openml.runs.run_model_on_task(task, optimizer)
        score = run.get_metric_fn(sklearn.metrics.accuracy_score)
        print('%s [SCORE] Data: %s; Accuracy: %0.2f' %
              (openmlpimp.utils.get_time(), task.get_dataset().name,
               score.mean()))
        if publish:
            run = run.publish()

        run_xml = run._create_description_xml()
        predictions_arff = arff.dumps(run._generate_arff_dict())

        with open(output_dir + '/run.xml', 'w') as f:
            f.write(run_xml)
        with open(output_dir + '/predictions.arff', 'w') as f:
            f.write(predictions_arff)

        if run.trace_content is not None:
            trace_arff = arff.dumps(run._generate_trace_arff_dict())
            with open(output_dir + '/trace.arff', 'w') as f:
                f.write(trace_arff)
        return run
    else:
        res = openml.runs.functions._run_task_get_arffcontent(
            optimizer, task, task.class_labels)
        run = openml.runs.OpenMLRun(task_id=task.task_id,
                                    dataset_id=None,
                                    flow_id=None,
                                    model=optimizer)
        run.data_content, run.trace_content, run.trace_attributes, run.fold_evaluations, _ = res
        score = run.get_metric_fn(sklearn.metrics.accuracy_score)

        print('%s [SCORE] Data: %s; Accuracy: %0.2f' %
              (openmlpimp.utils.get_time(), task.get_dataset().name,
               score.mean()))

        if run.trace_content is not None:
            trace_arff = arff.dumps(run._generate_trace_arff_dict())
            with open(output_dir + '/trace.arff', 'w') as f:
                f.write(trace_arff)

        predictions_arff = arff.dumps(run._generate_arff_dict())
        with open(output_dir + '/predictions.arff', 'w') as f:
            f.write(predictions_arff)
        return run
Example #15
def dumps(df):
    """
    dump DataFrame to str
    :param DataFrame df: 
    :rtype: str
    :return: dumped arff
    """
    arff = __dump(df)
    return liacarff.dumps(arff)
Example #16
def parth():
	data = arff.load(open(filename))

	for i in range(20):
		data['attributes'].insert(0, ('RANDOM' + str(i), 'NUMERIC'))
		for j in range(len(data['data'])):
			data['data'][j].insert(0, random.random())

	p4h_file = open('parth-' + filename, 'w')
	p4h_file.write(arff.dumps(data))
Example #17
    def test_encode_destiny(self):
        src = ARFF_DESTINY

        count = 0
        while count < 10:
            count += 1

            obj = arff.loads(src)
            src = arff.dumps(obj)
            self.assertEqual(src, ARFF_DESTINY)
Example #18
    def VTiter(self, *parsedArgs, **envars):
        import arff
        largs, dictargs = self.full_parse(parsedArgs)

        self.nonames = True
        self.names = []
        self.types = []
        data = {}

        if 'query' not in dictargs:
            raise functions.OperatorError(
                __name__.rsplit('.')[-1], "No query argument ")
        query = dictargs['query']

        cur = envars['db'].cursor()
        c = cur.execute(query)

        schema = cur.getdescriptionsafe()
        raw = []
        try:
            first_row = c.next()
            raw.append(first_row)
        except:
            f = open('input.arff', 'w')
            f.write(
                "@RELATION hour-weka.filters.unsupervised.attribute.Remove-R1-2\n\n"
            )
            for j in schema:
                f.write("@ATTRIBUTE %s NUMERIC\n" % (j[0], ))
            f.write("\n@DATA\n")
            yield (('result', ), )
            yield (1, )
            return

        updated_schema = []

        for i in range(len(schema)):
            t = (schema[i][0], "NUMERIC")
            updated_schema.append(t)

        data[u'attributes'] = updated_schema

        for row in c:
            raw.append(row)

        data[u'data'] = raw
        data[u'description'] = u''
        data[
            u'relation'] = u'hour-weka.filters.unsupervised.attribute.Remove-R1-2'

        f = open('input.arff', 'w')
        f.write(arff.dumps(data))

        yield (('result', ), )
        yield (1, )
Example #19
def run_experiment(rscv, task, args):
    try: 
        count = 1
        while count <= 100:
            try:
                print("%s Started classifier %s, condition %s, parameter '%s', deftype '%s', RS seed %s on task %s, dataset '%s'." % (hyperimp.utils.get_time(), args.classifier, args.condition, args.param, args.deftype, args.seed, args.task_id, task.get_dataset().name))
                # train model
                run = train_model(task, rscv)
                break
            except openml.exceptions.OpenMLServerError as e:
                if count == 100:
                    print("%s OpenMLServerError in run, I tried this 100 times already, so I'm just going to continue to the next run." % (hyperimp.utils.get_time()))
                    raise
                sleeptime = randint(5,60)
                print("%s Error in run, trying again in %d seconds. Message: %s" % (hyperimp.utils.get_time(), sleeptime, e))
                count += 1
                sleep(sleeptime)
        run.tags.append('study_%s' %str(args.study_id))
        score = run.get_metric_fn(sklearn.metrics.accuracy_score)
        print('%s [SCORE] Accuracy: %0.2f.' % (hyperimp.utils.get_time(), score.mean()))
        
        if args.log:
            # log xml, predictions 
            output_dir = args.output_dir + '/' + args.classifier + '/task_' + str(args.task_id) + '/' + str(args.condition)
            os.makedirs(output_dir)
            run_xml = run._create_description_xml()
            predictions_arff = arff.dumps(run._generate_arff_dict())
            with open(output_dir + '/run.xml', 'w') as f:
                f.write(run_xml)
            with open(output_dir + '/predictions.arff', 'w') as f:
                f.write(predictions_arff)
        else:
            pass
            
        count_run = 1
        while count_run <= 100:
            try:
                # publish run on OpenML
                run.publish()
                break
            except openml.exceptions.OpenMLServerError as e:
                if count_run == 100:
                    print("%s OpenMLServerError in run, I tried uploading this 100 times already, so I'm just going to continue to the next run." % (hyperimp.utils.get_time()))
                    raise
                sleeptime_run = randint(5,60)
                print("%s Error in uploading run trying again in %d seconds. Message: %s" % (hyperimp.utils.get_time(), sleeptime_run, e))
                count_run += 1
                sleep(sleeptime_run)
        print("%s Uploaded run condition %s, parameter %s, RS seed %s, task %s, with run id %d." % (hyperimp.utils.get_time(), args.condition, args.param, args.seed, args.task_id, run.run_id))
    except TimeoutError as e:
        print("%s Run timed out." % (hyperimp.utils.get_time()))
    except Exception as e:
        print("%s Error in run: %s" % (hyperimp.utils.get_time(), e))
        traceback.print_exc()
    return
Example #20
def csv_to_arff(X, label_i, savePath, datatype, isTrain=True):
    # get attributes
    if datatype == 'real':
        attributes = [(X.columns[i], u"REAL") for i in range(len(X.columns))]
        attributes.append(('label_' + label_i.name, ['0', '1']))
        data = []
        i = 0
        while i < len(label_i):
            attr_data = [j for j in list(X.iloc[i, :])]
            label_data = [str(label_i[i])]
            row_data = attr_data + label_data
            data.append(row_data)
            i += 1
        # set obj
        obj = {
            'description': u'',
            'relation': 'relation',
            'attributes': attributes,
            'data': data,
        }
    elif datatype == "nominal":
        attributes = [('attr_' + X.columns[i], ['0', '1'])
                      for i in range(len(X.columns))]
        attributes.append(('label_' + label_i.name, ['0', '1']))
        data = []
        i = 0
        while i < len(label_i):
            attr_data = [str(int(j)) for j in list(X.iloc[i, :])]
            label_data = [str(label_i[i])]
            row_data = attr_data + label_data
            data.append(row_data)
            i += 1
        # set obj
        obj = {
            'description': u'',
            'relation': 'relation',
            'attributes': attributes,
            'data': data,
        }

    else:
        raise TypeError("datatype must be 'real' or 'nominal'.")

    arff_data = arff.dumps(obj)
    if isTrain:
        #w_file = open(savePath+label_i.name+"_train.arff", "w")
        w_file = open(savePath + "/train.arff", "w")
        w_file.write(arff_data)
        w_file.close()
    elif not isTrain:
        w_file = open(savePath + "/test.arff", "w")
        w_file.write(arff_data)
        w_file.close()
    else:
        raise ValueError("what type of dataset?")
Example #21
    def test_simple(self):
        dumps = self.get_dumps()
        s = dumps(OBJ)
        self.assertEqual(s, ARFF)

        count = 0
        while count < 10:
            count += 1
            obj = arff.loads(s)
            src = arff.dumps(obj)
            self.assertEqual(src, ARFF)
Example #22
    def publish(self):
        """Publish a run to the OpenML server.

        Uploads the results of a run to OpenML.
        """
        predictions = arff.dumps(self._generate_arff())
        description_xml = self._create_description_xml()
        data = {'predictions': ("predictions.csv", predictions),
                'description': ("description.xml", description_xml)}
        return_code, return_value = _perform_api_call(
            "/run/", file_elements=data)
        return return_code, return_value
Example #23
    def _get_file_elements(self) -> Dict:
        """ Get file_elements to upload to the server.

        Derived child classes should overwrite this method as necessary.
        The description field will be populated automatically if not provided.
        """
        if self.model is None:
            raise PyOpenMLError(
                "OpenMLRun obj does not contain a model. "
                "(This should never happen.) "
            )
        if self.flow_id is None:
            if self.flow is None:
                raise PyOpenMLError(
                    "OpenMLRun object does not contain a flow id or reference to OpenMLFlow "
                    "(these should have been added while executing the task). "
                )
            else:
                # publish the linked Flow before publishing the run.
                self.flow.publish()
                self.flow_id = self.flow.flow_id

        if self.parameter_settings is None:
            if self.flow is None:
                self.flow = openml.flows.get_flow(self.flow_id)
            self.parameter_settings = self.flow.extension.obtain_parameter_values(
                self.flow,
                self.model,
            )

        file_elements = {'description': ("description.xml", self._to_xml())}

        if self.error_message is None:
            predictions = arff.dumps(self._generate_arff_dict())
            file_elements['predictions'] = ("predictions.arff", predictions)

        if self.trace is not None:
            trace_arff = arff.dumps(self.trace.trace_to_arff())
            file_elements['trace'] = ("trace.arff", trace_arff)
        return file_elements
Example #24
    def store(self, graph_builder, **kwargs):
        """ Stores the surface-relation features of the graph in ARFF format into the object file.
        :param graph_builder: The graph builder holding the coreference entities and their mentions.
        :param kwargs: Unused
        """
        import arff
        data = []
        entities = graph_builder.get_all_coref_entities()
        self.logger.debug("entities: %s", len(entities))
        for entity in entities:
            mentions = graph_builder.get_all_entity_mentions(entity)
            self.logger.debug("Mentions: %s", len(mentions))
            for mention in mentions:
                surface_learn = mention.get("surface_learn", {})

                self.logger.debug("links: %s", len(surface_learn))
                for link in surface_learn:
                    data.append(surface_learn[link])
                    self.logger.debug("link: %s: ", surface_learn[link])

        boolean = [str(True), str(False)]

        arff_file = {
            'attributes': [
                ('relax_match', boolean),
                ('mention_enumeration', boolean),
                ('candidate_enumeration', boolean),
                ('mention_appositive', boolean),
                ('candidate_appositive', boolean),
                ('equal_names', 'REAL'),
                ('equal_adjectives', 'REAL'),
                ('equal_rest', 'REAL'),
                ('extra_mention_names', 'REAL'),
                ('extra_mention_adjectives', 'REAL'),
                ('extra_mention_rest', 'REAL'),
                ('extra_candidate_names', 'REAL'),
                ('extra_candidate_adjectives', 'REAL'),
                ('extra_candidate_rest', 'REAL'),
                ('mention', 'STRING'),
                ('candidate', 'STRING'),
                ('sentence_distance', 'INTEGER'),
                ('linked', boolean)],
            'description': 'surface relations between mentions',
            'relation': 'surface',
            'data': data
        }
        if len(data):
            self.file.write(arff.dumps(arff_file))
        else:
            self.logger.info("Empty data")
Example #25
def export_arff(file_name: str, data: DataFrame, attributes, description: str):
    if description is None:
        description = ''
    if attributes is None:
        attributes = [(attribute, "STRING") for attribute in data.columns]
    arff_dict = {
        'description': description,
        'relation': file_name,
        'attributes': attributes,
        'data': data.values.tolist()
    }
    return "data:text/arff;charset=utf-8," + \
           urllib.parse.quote(arff.dumps(arff_dict))
Example #27
 def arffGenerator(self, dataList, rawNamesList, group):
     namesList = []
     for name in rawNamesList:
         namesList.append((name, 'REAL'))
     arffDict = {}
     arffDict['description'] = u''
     arffDict['relation'] = 'perfEvents'
     arffDict['attributes'] = namesList
     arffDict['data'] = dataList
     # print arff.dumps(arffDict)
     outFile = os.path.join(self.dumpTo, group + '.arff')
     with open(outFile, 'w') as f:
         f.write(arff.dumps(arffDict))
Example #28
def transform(file, dont_care_category, a, attributes_file):
    """input example
    [ {id: 123, text: "this is text body", category: ["dont_care"]} ]
    output example
    @relation game_media_bot

    @attribute

    :return:
    """
    classes = set()
    data = json.load(open(file, 'r'))

    master_vector = Counter()

    for tweet in data:
        classes.add(tweet['category'][0])
        if tweet['category'][0] != dont_care_category:
            master_vector += get_word_vector(tweet)

    print(master_vector)

    attrs = None
    if not attributes_file:
        # most common words in the text of the target category
        attrs = [(word, 'INTEGER') for word, _ in master_vector.most_common(a)]
        attrs.append(('class', [value for value in classes]))
    else:
        # load attributes from this file
        arff_data = arff.load(open(attributes_file, 'r'))
        attrs = arff_data['attributes']

    arff_data = {
        'attributes': attrs,
        'data': [],
        'description': '',
        'relation': '{}'.format(dont_care_category)
    }

    for tweet in data:
        word_vector = get_word_vector(tweet)
        tweet_data = [word_vector[attr[0]] for attr in attrs[:-1]]
        tweet_data.append(tweet['category'][0])
        arff_data['data'].append(tweet_data)

    out_file = file.replace('.json', '.arff')
    data = arff.dumps(arff_data)
    with open(out_file, 'w') as f:
        f.write(data)

    return out_file
Example #29
def md2arff():
    ri_obj_list = get_repo_info(to_dict=False, combine_star_events=True)
    repo_set = set()
    data = dict()
    data['attributes'] = attrs
    data['description'] = ''
    data['relation'] = 'readme'
    readme_file_set = set()
    inline_data = list()
    for ri in ri_obj_list:
        if (ri.repo_owner, ri.repo_name) in repo_set:
            continue
        repo_set.add((ri.repo_owner, ri.repo_name))
        paper_repo_owner = getattr(ri, 'paper_repo_owner')
        paper_repo_name = getattr(ri, 'paper_repo_name')
        repo_path = os.path.join(conf.repo_path, paper_repo_owner, paper_repo_name)

        assert os.path.exists(repo_path)
        file_list = os.listdir(repo_path)
        readme_path = ''
        for f in file_list:
            if f.lower().startswith('readme.'):
                readme_path = os.path.join(repo_path, f)
                break
        if readme_path == '':
            readme_content = ''
        else:
            with open(readme_path, 'r', encoding='utf-8', errors='ignore') as readme_f:
                readme_content = readme_f.read()

        if readme_path != '' and f.lower() == 'readme.md':
            readme_content = parse_markdown(readme_content)

        readme_content = readme_content.lower()
        readme_content = readme_content.replace('\n', ' ')
        readme_content = readme_content.replace('\"', ' ')
        readme_content = readme_content.replace('\'', ' ')
        inline_data_unit = list()
        if ri.stars_count >= threshold:
            inline_data_unit.append('popular')
        else:
            inline_data_unit.append('unpopular')
        inline_data_unit.append(readme_content)
        inline_data.append(inline_data_unit)

    data['data'] = inline_data

    file_content = arff.dumps(data)
    arff_path = os.path.join(conf.root_path, 'text_analysis.arff')
    with open(arff_path, 'w', encoding='utf-8') as f:
        f.write(file_content)
Example #30
    def _to_filesystem(self, file_path):
        """Serialize the trace object to the filesystem.

        Serialize the trace object as an arff.

        Parameters
        ----------
        file_path: str
            File path where the trace arff will be stored.
        """

        trace_arff = arff.dumps(self.trace_to_arff())
        with open(os.path.join(file_path, 'trace.arff'), 'w') as f:
            f.write(trace_arff)
Example #32
def dump_dataset(features,
                 feature_format,
                 evaluations,
                 train_context,
                 format='arff',
                 positive_class=None):
    if format == 'arff':
        data = {
            'attributes': [],
            'data': [],
            'description': '',
            'relation': 'default'
        }
        n_features = len(features.splitlines())
        for i in range(1, n_features + 1):
            feature = ('f%d' % i, ['+', '-'])
            data['attributes'].append(feature)

        target = train_context.target_table
        if not target in train_context.orng_tables:
            raise Exception(
                'Target table is not preloaded in memory! Please select the `dump data` parameter in the converter widget.'
            )
        if feature_format == 'aleph':
            target_vals = ('negative', 'positive')
        else:
            orng_target = train_context.orng_tables[target]
            target_vals = tuple(sorted(orng_target.domain.classVar.values))
        class_attr = ('class', target_vals)
        data['attributes'].append(class_attr)
        for line in evaluations.splitlines():
            values = line.strip().split()
            if feature_format == 'aleph':
                class_val = values[-1]
                if class_val == positive_class:
                    values[-1] = 'positive'
                else:
                    values[-1] = 'negative'
            data['data'].append(values)
        return arff.dumps(data)

    elif format == 'csv':
        data = ''
        for line in evaluations.splitlines():
            values = line.strip().split()
            data = data + ','.join(values) + '\n'
        return data

    return 'unsupported format'
Example #33
    def to_filesystem(
        self,
        directory: str,
        store_model: bool = True,
    ) -> None:
        """
        The inverse of the from_filesystem method. Serializes a run
        on the filesystem, to be uploaded later.

        Parameters
        ----------
        directory : str
            a path leading to the folder where the results
            will be stored. Should be empty

        store_model : bool, optional (default=True)
            if True, a model will be pickled as well. As this is the most
            storage expensive part, it is often desirable to not store the
            model.
        """
        if self.data_content is None or self.model is None:
            raise ValueError('Run should have been executed (and contain '
                             'model / predictions)')

        os.makedirs(directory, exist_ok=True)
        if not os.listdir(directory) == []:
            raise ValueError('Output directory {} should be empty'.format(
                os.path.abspath(directory)))

        run_xml = self._create_description_xml()
        predictions_arff = arff.dumps(self._generate_arff_dict())

        # It seems like typing does not allow to define the same variable multiple times
        with open(os.path.join(directory, 'description.xml'),
                  'w') as fh:  # type: TextIO
            fh.write(run_xml)
        with open(os.path.join(directory, 'predictions.arff'), 'w') as fh:
            fh.write(predictions_arff)
        if store_model:
            with open(os.path.join(directory, 'model.pkl'),
                      'wb') as fh_b:  # type: IO[bytes]
                pickle.dump(self.model, fh_b)

        if self.flow_id is None:
            self.flow.to_filesystem(directory)

        if self.trace is not None:
            self.trace._to_filesystem(directory)
Example #34
 def normalizeFunction(self, data, technique, folderName, arffs):
     t = time.time()
     # Choose Standardization Technique
     if technique == 'MinMaxScaler':
         scaler = MinMaxScaler()
     if technique == 'StandardScaler':
         scaler = StandardScaler()
     if technique == 'MaxAbsScaler':
         scaler = MaxAbsScaler()
     if technique == 'RobustScaler':
         scaler = RobustScaler()
     print("Converting with {} normalization..".format(technique))
     print("Transforming into a DataFrame..")
     # Convert data into DataFrame
     df = pd.DataFrame(data['data'])
     # Get features number
     length = df.iloc[0, :].values
     print("Number of features: {}".format(len(length) - 1))
     # Create a label column
     labels = df.iloc[:, len(length) - 1].values
     # Save features without labels
     data_aux = df.iloc[:, 0:(len(length) - 1)].values
     # Normalize data
     data_normalized = scaler.fit_transform(data_aux)
     # Adding the labels to normalized data
     data_normalized = np.concatenate((data_normalized, np.vstack(labels)),
                                      axis=1)
     # Replacing data with normalized samples
     data['data'] = data_normalized
     # Create the folder if it doesn't exist
     try:
         os.mkdir(folderName + "Normalized_Arffs", 0o755)
         print("Creating the folder where the arff files will be saved..")
     except OSError:
         print(
             "Folder already exists, just overwriting the arff files.."
         )
     # Saving arff in text file
     print("Saving arff..")
     newArffFile = open(
         folderName + "Normalized_Arffs/" + arffs[:-5] + "_" + technique +
         ".arff", "w")
     newArffFile.write(arff.dumps(data))
     newArffFile.close()
     print("Processing of file {} finished. (Execution time: {})".
           format(arffs,
                  time.time() - t))
     print("")
Example #35
 def test_files(self):
     fname = os.path.join(SRC_DIR, 'example.arff')
     data = [
         ['blonde', 17.2, 1],
         ['blue', 27.2, 2],
         ['blue', 18.2, 3],
         ]        
     arff.dump(fname, data, relation='diabetics_data', names=('hair_color', 'age', 'patno'))
     data = list(arff.load(os.path.join(SRC_DIR, fname)))
     arff_rows = arff.dumps(data)
     reparsed_data = list(arff.loads(arff_rows))
     
     data = [list(row) for row in data]
     reparsed_data = [list(row) for row in reparsed_data]
     
     self.assertEqual(data, reparsed_data)
Example #36
def generate(arff_file):
    ou = open(arff_file, "w")
    dataset = {
        'description': 'Motion sensor dataset',
        'relation': 'whatever',
        'attributes': [
            ('chair_prev', 'REAL'),
            ('bath_prev', 'REAL'),
            ('down_prev', 'REAL'),
            ('up_prev', 'REAL'),
            ('chair_post', 'REAL'),
            ('bath_post', 'REAL'),
            ('down_post', 'REAL'),
            ('up_post', 'REAL'),
            ('a_prev', 'REAL'),
            ('a_post', 'REAL'),
            ('tag', ['walk', 'chair', 'bath', 'down', 'up'])
        ]
    }

    sql = """select * from motion order by event_timestamp asc;"""
    m.execute(sql)
    data = []
    counter = 0
    for record in m:
        # print(record)
        row = []
        ts = float(record[5])
        prev = get_prev_obj(ts)
        post = get_post_obj(ts)
        for item in prev:
            row.append(item)
        for item in post:
            row.append(item)
        a = get_a(ts)
        for item in a:
            row.append(item)
        if record[-1] is None:
            row.append('?')
        else:
            row.append(record[-1])
        data.append(row)
        counter += 1
        print(counter)
    dataset['data'] = data
    ou.write(arff.dumps(dataset))
    ou.close()
Example #37
    def write_cv(self, splits):
        '''
            write cv.arff
        '''
        content = {
            "relation":
            "CV_Folds",
            "attributes": [("instance_id", "STRING"),
                           ("repetition", "NUMERIC"), ("fold", "NUMERIC")],
            "data": []
        }

        for split, idx in zip(splits, range(1, self.FOLDS + 1)):
            for inst_name in split:
                content["data"].append([inst_name, 1, idx])

        print(arff.dumps(content))
Example #38
    def to_filesystem(self, output_directory, store_model=True):
        """
        The inverse of the from_filesystem method. Serializes a run
        on the filesystem, to be uploaded later.

        Parameters
        ----------
        output_directory : str
            a path leading to the folder where the results
            will be stored. Should be empty

        store_model : bool
            if True, a model will be pickled as well. As this is the most
            storage expensive part, it is often desirable to not store the
            model.
        """
        if self.data_content is None or self.model is None:
            raise ValueError(
                'Run should have been executed (and contain model / predictions)'
            )

        try:
            os.makedirs(output_directory)
        except OSError as e:
            if e.errno == errno.EEXIST:
                pass
            else:
                raise e

        if not os.listdir(output_directory) == []:
            raise ValueError('Output directory should be empty')

        run_xml = self._create_description_xml()
        predictions_arff = arff.dumps(self._generate_arff_dict())

        with open(os.path.join(output_directory, 'description.xml'), 'w') as f:
            f.write(run_xml)
        with open(os.path.join(output_directory, 'predictions.arff'),
                  'w') as f:
            f.write(predictions_arff)
        if store_model:
            with open(os.path.join(output_directory, 'model.pkl'), 'wb') as f:
                pickle.dump(self.model, f)

        if self.trace is not None:
            self.trace._to_filesystem(output_directory)
Example #42
def export_arff(file, export_file, conf):
  """Takes a JSON list of incidents, processes them, and exports an ARFF file"""
  conf = json.load(open(conf))
  data = prepare.prepare(file, conf)

  # Structure of export
  export_data = {
    'attributes': [],
    'data': [],
    'relation': 'TrafficData'
  }

  paths = [jp_parse(path[1]) for path in conf['attributes']]

  # print data[0]
  for incident in data:
    entry = []
    for path in paths:
      results = path.find(incident)
      if len(results) > 0:
        if isinstance(results[0].value, basestring):
          entry.append(results[0].value.replace(" ", "-"))
        else:
          entry.append(results[0].value)
      else:
        entry.append(None)
    export_data['data'].append(entry)

  for index,attr in enumerate(conf['attributes']):
    if attr[2] in ['NUMERIC', 'REAL', 'INTEGER']:
      export_data['attributes'].append((attr[0], attr[2]))
    else:
      # Otherwise assume discrete
      vals = list(set([incident[index] for incident in export_data['data']]))
      export_data['attributes'].append((attr[0], vals))

  export_data['description'] = "\n".join(
    [str(datetime.datetime.now()), 
    json.dumps(conf, indent=2, separators=(',', ': '))])

  with open(export_file, "w") as f:
    f.write(arff.dumps(export_data))
Example #43
    def save_to_arff(cls, X, y, endian = "big", save_sparse = False):
        X = X.todok()
        y = y.todok()
        
        x_prefix = 0
        y_prefix = 0

        x_attributes = [(u'X{}'.format(i),u'NUMERIC') for i in xrange(X.shape[1])]
        y_attributes = [(u'y{}'.format(i), [unicode(0),unicode(1)]) for i in xrange(y.shape[1])]

        if endian == "big":
            y_prefix = X.shape[1]
            relation_sign = -1
            attributes = x_attributes + y_attributes

        elif endian == "little":
            x_prefix = y.shape[1]
            relation_sign = 1
            attributes = y_attributes + x_attributes 

        else:
            raise ValueError("Endian not in {big, little}")

        if save_sparse:
            data = [{} for r in xrange(X.shape[0])]
        else:
            data = [[0 for c in xrange(X.shape[1] + y.shape[1])] for r in xrange(X.shape[0])]
        
        for keys, value in X.iteritems():
            data[keys[0]][x_prefix + keys[1]] = value

        for keys, value in y.iteritems():
            data[keys[0]][y_prefix + keys[1]] = value

        dataset = {
            u'description': u'traindata',
            u'relation': u'traindata: -C {}'.format(y.shape[1] * relation_sign),
            u'attributes': attributes,                
            u'data': data
        }

        return arff.dumps(dataset)
Example #44
def ARFFCreation():

	dataSet = ARFFDataset(Stripped, nonStripped)

	attList = [
	('Gazetteer', ['TRUE', 'FALSE']),
	('CapitalLetter', ['TRUE', 'FALSE']),
	('Preposition', ['TRUE', 'FALSE']),
	('FollowingWord', ['TRUE', 'FALSE']),
	('Place', ['yes', 'no'])
	]

	obj = {
		'description': u'',
		'relation': 'PlaceNames',
		'attributes': attList,
		'data': dataSet,
	}

	with open('.\CreatedCSVs\TestData.arff', 'a') as f:
		f.write(arff.dumps(obj))
Example #45
    def save_to_arff(cls, X, y, endian="little", save_sparse=True):
        """Method for dumping the data to an ARFF string

        Parameters
        ----------

        X : scipy sparse matrix
            Array-like of input feature vectors

        y : scipy sparse matrix
            Binary indicator matrix of label assignments

        endian: string{"big", "little"}
            Whether the ARFF output should contain labels at the beginning of the attributes list ("big" endianness, MEKA format)
            or at the end ("little" endianness, MULAN format)

        save_sparse: boolean
            Whether to save the ARFF data in a sparse file format; liac-arff breaks if sparse reading is enabled for non-sparse ARFFs.

        Returns
        -------

        arff_string : string
            The ARFF representation of the dataset, as produced by arff.dumps

        """
        X = X.todok()
        y = y.todok()

        x_prefix = 0
        y_prefix = 0

        x_attributes = [(u'X{}'.format(i), u'NUMERIC')
                        for i in xrange(X.shape[1])]
        y_attributes = [(u'y{}'.format(i), [unicode(0), unicode(1)])
                        for i in xrange(y.shape[1])]

        if endian == "big":
            y_prefix = X.shape[1]
            relation_sign = -1
            attributes = x_attributes + y_attributes

        elif endian == "little":
            x_prefix = y.shape[1]
            relation_sign = 1
            attributes = y_attributes + x_attributes

        else:
            raise ValueError("Endian not in {big, little}")

        if save_sparse:
            data = [{} for r in xrange(X.shape[0])]
        else:
            data = [[0 for c in xrange(X.shape[1] + y.shape[1])]
                    for r in xrange(X.shape[0])]

        for keys, value in X.iteritems():
            data[keys[0]][x_prefix + keys[1]] = value

        for keys, value in y.iteritems():
            data[keys[0]][y_prefix + keys[1]] = value

        dataset = {
            u'description': u'traindata',
            u'relation': u'traindata: -C {}'.format(y.shape[1] * relation_sign),
            u'attributes': attributes,
            u'data': data
        }

        return arff.dumps(dataset)
Example #46
			if(surgeryType != ""):
				surgeryCount += 1
			elif(radiationType != ""):
				radiationCount += 1
			
			behavior = line[223] # BEHAVIOR RECODE FOR ANALYSIS
			histology = line[225:227] # HISTOLOGY RECODE - BROAD GROUPINGS
			
			causeOfDeath = line[271] # SEER OTHER CAUSE OF DEATH CLASS
			survivalMonths = line[305:309] # Survival months - presumed alive
		
			data['data'].append([type, maritalStatus, race, tumorSize, survivalMonths])

	
	# Write arff
	arffContents = arff.dumps(data)
	output = open('cancer.arff', 'w')
	output.write(arffContents)
	
	
	# Count things
	print "Number of records: " + str(numLines)
	print "Number of patients: " + str(len(patientIDs))

	numRecordCounts = {}
	for patientID in patientIDs :
		if(patientIDs[patientID] in numRecordCounts):
			numRecordCounts[patientIDs[patientID]] += 1
		else:
			numRecordCounts[patientIDs[patientID]] = 1
	print "Number of patients with X number of records: "
Example #47
def hbp_submit_search_criteria(input_dict):
    import orange
    import tempfile
    import arff
    import os

    dset = arff.load(open(os.path.dirname(os.path.abspath(__file__)) + "/new_adni.arff", "rb"))

    b = {}
    query = input_dict["query"]

    b["attributes"] = []

    if query["classification"] != None:
        b["attributes"].append(("Classification", [u"CN", u"AD", u"LMCI", u"EMCI", u"SMC"]))
    else:
        query["classification"] = [None]

    if query["geo"] != None:
        b["attributes"].append(
            ("geo", [u"Africa", u"Asia", u"Australia", u"Europe", u"North America", u"South America"])
        )
    else:
        query["geo"] = [None]

    if query["age"] != None:
        b["attributes"].append(("age", [u"1-18", u"18-24", u"25-34", u"35-44", u"45-54", u"55-64", u"65+"]))
    else:
        query["age"] = [None]

    fields = []
    for c in query["classification"]:
        for g in query["geo"]:
            for a in query["age"]:
                fields.append((c, g, a))

    b["attributes"].append(("count", "NUMERIC"))

    b["description"] = ""
    b["relation"] = "HBP"
    b["data"] = []

    for field in fields:
        d = []
        if field[0] != None:
            d.append(field[0])
        if field[1] != None:
            d.append(field[1])
        if field[2] != None:
            d.append(field[2])
        counts = 0
        for i in dset["data"]:
            if (
                (i[-1] == field[0] or field[0] == None)
                and (i[0] == field[2] or field[2] == None)
                and (i[1] == field[1] or field[1] == None)
            ):
                counts = counts + 1

        d.append(counts)

        b["data"].append(d)

    # a = arff.load_data()
    f = tempfile.NamedTemporaryFile(delete=False, suffix=".arff")
    f.write(arff.dumps(b))
    f.close()
    output_dict = {}
    output_dict["results"] = orange.ExampleTable(f.name)
    return output_dict
Example #48
    def handle(self, *args, **options):
        params = json.load(open(args[0]))

        source_name = params['database']
        
        label_table_name = None
        label_column_name = None
        
        for key,value in params['label'].iteritems():
            label_table_name = key
            label_column_name = value
        
        # Add parameters for setting ranges...
        
        past = datetime.datetime(datetime.MINYEAR, 1, 1)
        future = datetime.datetime(datetime.MAXYEAR, 12, 31)
        
        for ds in DataSource.objects.filter(name__contains=source_name):
            table_names = ds.table_names()

            table_columns = {}
            
            for name in table_names:
                table_columns[name] = ds.table_columns(name)
                
#            print(json.dumps(table_columns, indent=2))
        
            points = ds.fetch_data(label_table_name, label_column_name, past, future)
            
            label_value_name = label_table_name + '_' + label_column_name
            
            categorical_values = {}
            rows = []
            row_keys = []
            
            for point in points:
                row_dict = {}
                
                point_time = point[0]
                label_value = point[1]

                row_dict[label_value_name] = label_value
                
                for table, columns in table_columns.iteritems():
                    fetched = ds.fetch_nearest(point_time, table, columns)
                    
                    if len(fetched) > 0:
                        for i in range(0, len(columns)):
                            column = columns[i]
                            
                            column_key = table + '_' + column[0]
                            
                            if row_keys.count(column_key) == 0:
                                row_keys.append(column_key)
                            
                            if column[1] == 'text' or column[1] == 'boolean':
                                column_values = set(['?'])

                                try:
                                    column_values = categorical_values[column_key]
                                except KeyError:
                                    categorical_values[column_key] = column_values
                                
                                column_values.add(slugify(unicode(str(fetched[i]))))
                                
                            if fetched[i] == True or fetched[i] == False:
                                row_dict[column_key] = slugify(unicode(str(fetched[i])))
                            elif column[1] == 'text':
                                row_dict[column_key] = slugify(unicode(fetched[i]))
                            else:
                                row_dict[column_key] = fetched[i]
                        
                rows.append(row_dict)

            row_keys.sort()            
                
            data = { 'relation': label_table_name + '_' + label_column_name, 'description': '' }
            
            attributes = []
            
            ignore = []

            for row_key in row_keys:
                value_def = 'REAL'

                if row_key in categorical_values:
                    value_def = []
                    
                    for value in categorical_values[row_key]:
                        value_def.append(value)
                
                if value_def == 'REAL' or len(value_def) > 1:
                    attributes.append((row_key, value_def))
                else:
                    ignore.append(row_key)
                
            data['attributes'] = attributes
            
            data_rows = []
            
            for row_dict in rows:
                data_row = []
                
                for row_key in row_keys:
                    if ignore.count(row_key) == 0:
                        try:
                            data_row.append(row_dict[row_key])
                        except KeyError:
                            data_row.append(None)
                
                data_rows.append(data_row)
            
            data['data'] = data_rows
            
            print('dumping....')
            
            print(arff.dumps(data))

            print('done.')
Ejemplo n.º 49
0
def write_to_arff(data, filename):
    with open(filename, 'w') as f:
        f.write(arff.dumps(data))
def main():
	lno=1
	word_tot=0
	corr=0
	init_corr=0
	beng_statistics = {}
	beng_statistics = ng.ngram_prof("./beng_train.txt",beng_statistics)
	eng_statistics = {}

	'''	
	text = " ".join(brown.words())
	tokenizer = RegexpTokenizer("[a-zA-Z'`]+")
	text = tokenizer.tokenize(text)
	text = " ".join(text)
	brown_words=open("./brown_words.txt",'w')
	brown_words.write(text)
	brown_words.close()
	'''

	eng_statistics = ng.ngram_prof("./brown_words.txt",eng_statistics)

	lang_stats={}
	lang_stats.update({'e':eng_statistics})
	lang_stats.update({'b':beng_statistics})


	#fin=open("./beng_corpus.txt",'r')
	fin=open("./BanglaEnglish_FIRE2013_AnnotatedDev.txt",'r')
	#fout_pred=open("./predicted_tags_arff.txt",'w')
	#fout_corr=open("./corrected_tags_arff.txt",'w')
	word_list=[]
	eng_dic=[]
	beng_dic=[]
	ngram=[]
	surround=[]
	corr_tag=[]
	data=[]
	sent=fin.readline()
	while(sent):
		############ Only for Facebook corpus (COMMENT for FIRE)
		#sent = re.sub(r'[^\w\s]','',sent)
		##########################################
		words=[]
		sent=sent.split()
		for elem in sent:
			############ Only for FIRE CORPUS (COMMENT for Facebook)
			elem=elem.split('\\')
			corr_tag.append(elem[1][0])
			elem=elem[0]
			#################################
			elem = elem.strip()
			words.append(elem)
		type_map = defaultdict(str)
		type_count = defaultdict(int)
		word_count=0

		for word in words:
			word=word.strip(" ")
			word_list.append(word)
			word_count+=1
			beng_rat=0
			eng_rat=0
			if(beng.beng_word(word)):
				beng_rat=1
			if(eng.eng_search(word)):
				eng_rat=1

			############ REVERT 1/6
			#eng_dic.append(eng_rat)
			#beng_dic.append(beng_rat)
			##############

			word_statistics={}
			lang_ratio = {}

			grams = ngrams(word,4,pad_left=True,pad_right=True,left_pad_symbol=' ',right_pad_symbol=' ')
			grams=list(grams)
			ng_list=[]
			for j in range (len(grams)):
				ng_list.append(''.join(grams[j]))
			word_statistics = ng.ngram_hash(ng_list,word_statistics)
			word_statistics = sorted(word_statistics.items(), key=operator.itemgetter(1), reverse=True)
			for lang, ngrams_statistics in lang_stats.items():
				distance = ng.compare_ng_prof(ngrams_statistics,word_statistics)
				lang_ratio.update({lang:distance})
			ng_lang_result = min(lang_ratio, key = lang_ratio.get).upper()

			ngram.append(ng_lang_result)

			if(beng_rat and not eng_rat):
				############# REVERT 2/6
				beng_dic.append('2')
				###################
				type_word="Bengali word"
			elif(eng_rat and not beng_rat):
				############# REVERT 3/6
				beng_dic.append('0')
				###################
				type_word="English word"
			else:
				############# REVERT 4/6
				beng_dic.append('1')
				###################
				if(ng_lang_result=='B'):
					type_word="Bengali word"
				else:
					type_word="English word"

			type_map[word]=type_word
			type_count[type_word]+=1

			#print(str(word)+"(Detect:"+str(type_map[word])+")")
			
		if((type_count["English word"])>(type_count["Bengali word"])):
			default="e"
		else:
			default="b"
		print(str(lno)+default)
		lno+=1
		for i in range(len(words)):
			word_count=0
			type_count["English word"]=0
			type_count["Bengali word"]=0

			if(i>1 and i<(len(words)-2)):
				word_count=4
				for j in range(i-2,min((len(words)-1),(i+3))):
					if(j!=i):
						type_count[type_map[words[j]]]+=1
			elif (i<=1):
				word_count=min(i+2,len(words)-1)
				for j in range(min((len(words)-1),(i+3))):
					if(j!=i):
						type_count[type_map[words[j]]]+=1
			elif(i>=(len(words)-2)):
				word_count= (len(words)-i)+1
				for j in range(i-2,len(words)):
					if(j!=i):
						type_count[type_map[words[j]]]+=1
			if(word_count):
				surround.append(str(type_count["Bengali word"]/word_count))
				#if((type_count["Bengali word"]/word_count)>=0.5):
				#	surround.append('1')
				#else:
				#	surround.append('0')
			else:
				surround.append('0')
		sent=fin.readline()

	
	############ Only for Facebook corpus (COMMENT for FIRE)
	#real_tag = open("./real_tags.txt",'r')
	#line = real_tag.readline().split()
	#for word in line:
	#	tags=word.split('\\')
	#	corr_tag.append(tags[1])
	#real_tag.close()
	######################################
	

	for i in range(len(word_list)):
		vector=[]
#		vector.append(word_list[i])
		########REVERT 5/6
		#vector.append(int(eng_dic[i]))
		##############
		vector.append(int(beng_dic[i]))
		vector.append(ngram[i])
		vector.append(float(surround[i]))
		vector.append(corr_tag[i])
		data.append(vector)

	obj={
	'description': u'',
	'relation': 'langid',
	'attributes': [
	#############REVERT 6/6
	#('eng_dict','NUMERIC'),
	####################
	('beng_dict','NUMERIC'),
	('ngram','STRING'),
	('beng_surr','REAL'),
	('real_tag','STRING')
	],
	'data':data,

	}

	final = arff.dumps(obj)

	#FIRE
	final_file=open("./lang_id_fire.arff",'w')
	#
	#Facebook
	#final_file=open("./lang_id.arff",'w')
	#
	final_file.write(final)
	final_file.close()

	fin.close()
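
A hypothetical call to the write_to_arff helper defined above; the two feature vectors and the output path "toy_langid.arff" are made up for illustration and simply mirror the obj structure built in main().

# Hypothetical usage of write_to_arff; rows and filename are invented.
toy_obj = {
    'description': u'',
    'relation': 'langid',
    'attributes': [
        ('beng_dict', 'NUMERIC'),
        ('ngram', 'STRING'),
        ('beng_surr', 'REAL'),
        ('real_tag', 'STRING'),
    ],
    'data': [
        [2, 'B', 0.5, 'b'],   # word found only in the Bengali dictionary
        [0, 'E', 0.0, 'e'],   # word found only in the English dictionary
    ],
}
write_to_arff(toy_obj, "toy_langid.arff")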
Ejemplo n.º 51
0
import arff
import pickle

attributes = []
nFFT = 64
baseStr = "FFT_"
for i in range(nFFT):
    name = baseStr + str(i)
    attributes.append((name,u'REAL'))
attributes.append(("Ang",u'REAL'))
#print(attributes)
relation = "Dist_FFT"

dataF = []
samples = 120
for i in range(samples):
    filename = str(i+1) + ".p"
    f = open(filename,"rb")
    p = pickle.load(f)
    f.close()
    p.pop(-2)
    dataF.append(p)

data = {u'attributes': attributes, u'data': dataF, u'description': u'', u'relation': relation}

f = open("myData2.arff","w")
f.write(arff.dumps(data))
f.close()
Ejemplo n.º 52
0
    def test_encode_source(self):
        obj = arff.loads(ARFF_SOURCE)
        result = arff.dumps(obj)
        expected = ARFF_DESTINY

        self.assertEqual(result, expected)
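
The ARFF_SOURCE and ARFF_DESTINY fixtures used by this test are not shown here; a stand-alone sketch of the same loads/dumps round trip, using a small invented ARFF snippet instead of those fixtures, could look like this:

# Round-trip sketch with an invented ARFF snippet (not the test fixtures above).
import arff

SAMPLE = """@RELATION toy
@ATTRIBUTE width NUMERIC
@ATTRIBUTE class {a,b}
@DATA
1.0,a
2.5,b
"""

decoded = arff.loads(SAMPLE)     # ARFF text -> dict with relation/attributes/data
reencoded = arff.dumps(decoded)  # dict -> ARFF text again
print(reencoded)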
Ejemplo n.º 53
0
def save_to_arff(X, y, label_location="end", save_sparse=True, filename=None):
    """Method for dumping data to ARFF files

    Parameters
    ----------
    X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
        input feature matrix
    y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
        binary indicator matrix with label assignments
    label_location: string {"start", "end"} (default is "end")
        whether the ARFF file will contain labels at the beginning of the
        attributes list ("start", MEKA format)
        or at the end ("end", MULAN format)
    save_sparse: boolean
        Whether to save in ARFF's sparse dictionary-like format instead of listing all
        zeroes within file, very useful in multi-label classification.
    filename : str or None
        Path to ARFF file, if None, the ARFF representation is returned as string
    Returns
    -------
    str or None
        the ARFF dump string, if filename is None
    """
    X = X.todok()
    y = y.todok()

    x_prefix = 0
    y_prefix = 0

    x_attributes = [(u'X{}'.format(i), u'NUMERIC')
                    for i in range(X.shape[1])]
    y_attributes = [(u'y{}'.format(i), [str(0), str(1)])
                    for i in range(y.shape[1])]

    if label_location == "end":
        y_prefix = X.shape[1]
        relation_sign = -1
        attributes = x_attributes + y_attributes

    elif label_location == "start":
        x_prefix = y.shape[1]
        relation_sign = 1
        attributes = y_attributes + x_attributes

    else:
        raise ValueError("Label location not in {start, end}")

    if save_sparse:
        data = [{} for r in range(X.shape[0])]
    else:
        data = [[0 for c in range(X.shape[1] + y.shape[1])]
                for r in range(X.shape[0])]

    for keys, value in list(X.items()):
        data[keys[0]][x_prefix + keys[1]] = value

    for keys, value in list(y.items()):
        data[keys[0]][y_prefix + keys[1]] = value

    dataset = {
        u'description': u'traindata',
        u'relation': u'traindata: -C {}'.format(y.shape[1] * relation_sign),
        u'attributes': attributes,
        u'data': data
    }

    arff_data = arff.dumps(dataset)

    if filename is None:
        return arff_data

    with open(filename, 'w') as fp:
        fp.write(arff_data)
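
A minimal sketch of calling save_to_arff as defined above, assuming scipy is available; the toy feature matrix, label matrix and the idea of printing the result are illustrative only.

# Illustrative usage of save_to_arff with tiny, made-up matrices.
import numpy as np
import scipy.sparse as sp

X_toy = sp.csr_matrix(np.array([[0.0, 1.5], [2.0, 0.0]]))  # 2 samples, 2 features
y_toy = sp.csr_matrix(np.array([[1, 0], [0, 1]]))          # 2 samples, 2 labels

# filename=None, so the ARFF text is returned instead of written to disk.
arff_text = save_to_arff(X_toy, y_toy, label_location="end", save_sparse=True)
print(arff_text)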
Ejemplo n.º 54
0
def hbp_interactive_analysis_post(postdata, input_dict, output_dict):
    import orange
    import tempfile
    import arff
    import os
    import numpy

    d = input_dict["Dataset"]

    dset = arff.load(open(os.path.dirname(os.path.abspath(__file__)) + "/new_adni.arff", "rb"))

    b = {}
    b["attributes"] = []

    feature_names = [x.name for x in d.domain]

    if "Classification" in feature_names:
        b["attributes"].append(("Classification", [u"CN", u"AD", u"LMCI", u"EMCI", u"SMC"]))

    if "geo" in feature_names:
        b["attributes"].append(
            ("geo", [u"Africa", u"Asia", u"Australia", u"Europe", u"North America", u"South America"])
        )

    if "age" in feature_names:
        b["attributes"].append(("age", [u"1-18", u"18-24", u"25-34", u"35-44", u"45-54", u"55-64", u"65+"]))

    for v in postdata["variables"]:
        b["attributes"].append((v + "_avg", "NUMERIC"))
        b["attributes"].append((v + "_stdev", "NUMERIC"))

    b["attributes"].append(("count", "NUMERIC"))

    b["description"] = ""
    b["relation"] = "HBP"
    b["data"] = []

    for i in d:
        new_i = []
        if "Classification" in feature_names:
            clas = i[feature_names.index("Classification")].value
            new_i.append(clas)
        else:
            clas = None

        if "geo" in feature_names:
            geo = i[feature_names.index("geo")].value
            new_i.append(geo)
        else:
            geo = None

        if "age" in feature_names:
            age = i[feature_names.index("age")].value
            new_i.append(age)
        else:
            age = None

        sums = {}
        for v in postdata["variables"]:
            sums[v] = 0

        vcounts = {}
        for v in postdata["variables"]:
            vcounts[v] = 0

        vvalues = {}
        for v in postdata["variables"]:
            vvalues[v] = []

        count = 0

        attrs = [x[0] for x in dset["attributes"]]

        for ins in dset["data"]:
            if (ins[-1] == clas or clas is None) and (ins[0] == geo or geo is None) and (ins[1] == age or age is None):
                count = count + 1
                for k, v in sums.items():
                    try:
                        sums[k] = v + ins[attrs.index(k)]
                        vcounts[k] = vcounts[k] + 1
                        vvalues[k].append(v)
                    except:
                        pass

        for v in postdata["variables"]:
            print(vvalues[v])
            stdev = numpy.std(vvalues[v])
            try:
                avg = sums[v] / vcounts[v]
            except:
                avg = -2
            new_i.append(avg)
            new_i.append(stdev)
        new_i.append(count)
        b["data"].append(new_i)

    f = tempfile.NamedTemporaryFile(delete=False, suffix=".arff")
    f.write(arff.dumps(b))
    f.close()
    output_dict = {}
    output_dict["results"] = orange.ExampleTable(f.name)
    return output_dict
Ejemplo n.º 55
0
def csv_to_arff(fileinput, type_list, relation_name, selected_attrs):
    with open(fileinput, 'r') as inputfile:
        data = csv.reader(inputfile, delimiter=',')
        arff_content = data_to_dict(data, type_list, relation_name, selected_attrs)
        return arff.dumps(arff_content)
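
The data_to_dict helper used above is not shown; assuming it simply assembles the usual liac-arff dictionary from the CSV rows, a self-contained variant of the same CSV-to-ARFF idea (treating every column as a string) might look like this:

# Stand-alone sketch; does not use the (unshown) data_to_dict helper above.
import csv
import arff

def csv_to_arff_simple(fileinput, relation_name):
    with open(fileinput, 'r') as inputfile:
        reader = csv.reader(inputfile, delimiter=',')
        header = next(reader)              # first row holds the column names
        rows = [row for row in reader]     # remaining rows become ARFF data
    content = {
        'description': '',
        'relation': relation_name,
        'attributes': [(name, 'STRING') for name in header],  # all columns as strings
        'data': rows,
    }
    return arff.dumps(content)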
Ejemplo n.º 56
0
        #(u'Firstyrcumgpa', u'REAL')
    ]
if isTestSet:
    template_attr = []

# Create ARFF template
template = {
    u'attributes': template_attr,
    u'data': data, # list
    u'description': u'',
    u'relation': u'admission_stats'
}

# Save ARFF FILE
with open(output_arffFile, 'w') as arffFile:
    arffFile.write(arff.dumps(template))

print('Finished Classifying...')

input_arff = output_arffFile

if not useConverts:
    with open(input_arff, 'r+') as af:
        arffFile = arff.load(af)
        data = arffFile['data']
        datalist = []
        for row in data:
            replaceByConversion(row)
            datalist.append(row)
        print('Finished Converting...')
        arffFile['data'] = datalist
Ejemplo n.º 57
0
    def handle(self, *args, **options):
        count = 0
        
        for job in ReportJob.objects.filter(job_start=None):
            params = json.loads(job.parameters)

            source_name = params['database']
            
            label_table_name = None
            label_column_name = None
            
            for key,value in params['label'].iteritems():
                label_table_name = key
                label_column_name = value
                
            job.job_start = datetime.datetime.now()
            job.save()
            
            # Add parameters for setting ranges...
            
            past = datetime.datetime(datetime.MINYEAR, 1, 1)
            future = datetime.datetime(datetime.MAXYEAR, 12, 31)
            
            for ds in DataSource.objects.filter(name__contains=source_name):
                table_names = ds.table_names()
    
                table_columns = {}
                
                for name in table_names:
                    table_columns[name] = ds.table_columns(name)
            
                points = ds.fetch_data(label_table_name, label_column_name, past, future)
    
                label_value_name = label_table_name + '_' + label_column_name
                
                categorical_values = {}
                rows = []
                row_keys = []
                
                for point in points:
                    row_dict = {}
                    
                    point_time = point[0]
                    label_value = point[1]

                    row_dict[label_value_name] = label_value
                    
                    for table, columns in table_columns.iteritems():
                        fetched = ds.fetch_nearest(point_time, table, columns)
                        
                        if len(fetched) > 0:
                            for i in range(0, len(columns)):
                                column = columns[i]
                                
                                column_key = slugify(table + '_' + column[0])
                                
                                if row_keys.count(column_key) == 0:
                                    row_keys.append(column_key)
                                
                                if column[1] == 'text' or column[1] == 'boolean':
                                    column_values = set([None])
    
                                    try:
                                        column_values = categorical_values[column_key]
                                    except KeyError:
                                        categorical_values[column_key] = column_values
                                    
                                    str_value = slugify(unicode(str(fetched[i])))
                                    
                                    if str_value.strip() == '':
                                        str_value = 'empty_string'
                                    
                                    column_values.add(str_value)
                                    
                                if fetched[i] == True or fetched[i] == False:
                                    row_dict[column_key] = slugify(unicode(str(fetched[i])))
                                elif column[1] == 'text':
                                    str_value = slugify(unicode(fetched[i]))
                                    
                                    if str_value.strip() == '':
                                        str_value = 'empty_string'
                                    
                                    row_dict[column_key] = str_value
                                else:
                                    row_dict[column_key] = fetched[i]
                            
                    rows.append(row_dict)
    
                row_keys.sort()            
                    
                data = { 'relation': slugify(label_table_name + '_' + label_column_name), 'description': '' }
                
                attributes = []
                
                ignore = []
    
                for row_key in row_keys:
                    value_def = 'REAL'
    
                    if row_key in categorical_values:
                        value_def = []
                        
                        for value in categorical_values[row_key]:
                            value_def.append(value)
                    
                    if ('_sensor_dt_' not in row_key and 'randomnoiseprobe' not in row_key
                            and (value_def == 'REAL' or len(value_def) > 2
                                 or (None not in value_def and len(value_def) > 1))):
                        attributes.append((row_key, value_def))
                    else:
                        ignore.append(row_key)
                    
                data['attributes'] = attributes
                
                data_rows = []
                
                for row_dict in rows:
                    data_row = []
                    
                    for row_key in row_keys:
                        if ignore.count(row_key) == 0:
                            try:
                                data_row.append(row_dict[row_key])
                            except KeyError:
                                data_row.append(None)
                    
                    data_rows.append(data_row)
                
                data['data'] = data_rows
                job.result_file.save(str(job.pk) + '_' + str(uuid.uuid4()), ContentFile(arff.dumps(data)))
                job.result_type = 'text/arff'

            job.job_end = datetime.datetime.now()
            job.save()
            
            count += 1
            
        print(str(count) + ' job(s) run.')