Code example #1
def generate_results(res, classes_name):
    """
    Converts the predictions to csv and creates the 'output.csv' file in the resources folder
    :param res:
    :param classes_name:
    :return: void
    """
    to_output = []
    for i, prediction in enumerate(res):
        to_output.append({'Id': i, 'Category': classes_name[prediction]})
    convert_to_csv(to_output)
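
These examples evidently come from different codebases, so convert_to_csv is not a single function. In this first example it is called with a list of dicts and, per the docstring, writes 'output.csv' into a resources folder. A minimal sketch consistent with that usage (the output path and the DictWriter field handling are assumptions, not the original implementation):

import csv
import os

def convert_to_csv(records, out_path=os.path.join("resources", "output.csv")):
    # Hypothetical sketch: write a list of dicts to 'output.csv',
    # taking the column order from the first record.
    if not records:
        return
    with open(out_path, "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(records[0].keys()))
        writer.writeheader()
        writer.writerows(records)
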
Code example #2
import csv
import os
import sys

# `db` and `utils` are project-local modules in the original source.

def start():
    print("importing worldbank data...")
    db.insert("source", {"name": "World Bank"})
    utils.convert_to_csv(
        os.path.join("data", "worldbank", "IND_Country_MetaData_en_EXCEL.xls"),
        os.path.join("data", "worldbank"))

    # import dataset
    with open(
            os.path.join(
                "data", "worldbank",
                "IND_Country_MetaData_en_EXCEL-sheet2.csv")) as datafile:
        reader = csv.reader(datafile.read().splitlines())

    for i, row in enumerate(reader):
        if i == 0:
            continue
        row = [unicode(c, "utf-8", errors="ignore") for c in row]
        db.insert_dataset({
            "name": row[1][:150],
            "title": row[1],
            "description": row[2],
            "source_info": row[3],
            "source": "World Bank"
        })

    # import data
    with open(
            os.path.join(
                "data", "worldbank",
                "IND_Country_MetaData_en_EXCEL-sheet1.csv")) as datafile:
        reader = csv.reader(datafile.read().splitlines())

    db.insert("region", {"name": "India"})

    for i, row in enumerate(reader):
        if i == 0:
            # the first row holds the headers; columns 2+ are year periods
            headers = row
            for year in row[2:]:
                db.insert("period", {"name": year})

        else:
            for ci, value in enumerate(row):
                if ci > 1 and utils.flt(value):
                    db.insert(
                        "data", {
                            "dataset": row[0],
                            "period": headers[ci],
                            "value": value,
                            "region": "India",
                        })
            if i % 100 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
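
Here utils.convert_to_csv takes an .xls path and a destination directory; the code then reads per-sheet files named '<name>-sheet1.csv' and '<name>-sheet2.csv', and example #3 iterates over its return value as a list of paths. A minimal sketch consistent with both call sites, assuming the xlrd library and Python 3 (the original snippets are Python 2):

import csv
import os
import xlrd

def convert_to_csv(xls_path, dest_dir):
    # Hypothetical sketch: write each sheet of the workbook to its own
    # '<name>-sheetN.csv' file and return the list of paths created.
    base = os.path.splitext(os.path.basename(xls_path))[0]
    out_paths = []
    for sheet_no, sheet in enumerate(xlrd.open_workbook(xls_path).sheets(), 1):
        out_path = os.path.join(dest_dir, "%s-sheet%d.csv" % (base, sheet_no))
        with open(out_path, "w", newline="") as csvfile:
            writer = csv.writer(csvfile)
            for row_no in range(sheet.nrows):
                writer.writerow(sheet.row_values(row_no))
        out_paths.append(out_path)
    return out_paths
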
Code example #3
import os
import urllib2

import requests

# `properties`, `sources`, `max_file_size` and the helper functions used
# below are project-local in the original source.

def download():
	properties.load_properties()

	# pages 39-60 of the source listing
	for i in range(39, 61):
		print("for page %s" % i)
		response = requests.get(sources[0] + "?page=%s" % i, verify=False)
		page_properties = get_url_title_and_description_from_html(response.text)
		
		for filename in page_properties:
			filepath = os.path.join("data", "data.gov.in", filename)
			if not os.path.exists(filepath):
				try:
					url = urllib2.urlopen(page_properties[filename]["url"])
					size = url.headers["Content-Length"]
					if int(size) < int(max_file_size):
						with open(filepath, "wb") as datafile:
							r = requests.get(page_properties[filename]["url"])
							for chunk in r.iter_content(1024):
								datafile.write(chunk)
					else:
						print("[ignored] [too big] %s (%s)" % (filename, size))
				except urllib2.HTTPError as e:
					print(e)
			
			if os.path.exists(filepath):
				files = convert_to_csv(filepath, os.path.join("data", "csv"))
				for fpath in files:
					prepend_property_headers(fpath, page_properties[filename])
Code example #4
    def testfile(self, test_records):
        """Returns a NamedTemporaryFile with the test records saved as CSV."""

        tmpfile = NamedTemporaryFile()
        assert os.path.exists(tmpfile.name)

        with open(tmpfile.name, 'w') as tmpfilewriter:
            data_csv = convert_to_csv(test_records)
            tmpfilewriter.write(data_csv)
        return tmpfile
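
In examples #4 through #7, convert_to_csv instead returns the CSV data as a string, which the callers write out or upload themselves. A minimal sketch of that variant, assuming the records are row tuples or lists:

import csv
import io

def convert_to_csv(records):
    # Hypothetical sketch: serialize a list of rows into one CSV string.
    buffer = io.StringIO()
    csv.writer(buffer).writerows(records)
    return buffer.getvalue()
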
Code example #5
    def save(self, records=None):
        """Saves records to the slice file in the bucket."""
        if records is not None:
            records_list = records
        else:
            if not self._records:
                return
            # Flatten the per-channel records and sort by their first field.
            records_list = []
            for channeled_records in self._records.values():
                records_list.extend(channeled_records)
            records_list.sort(key=lambda record: record[0])

        data_csv = convert_to_csv(records_list)
        blob = self._bucket.blob(self._filename)
        blob.upload_from_string(data_csv)
Code example #6
    def write_to_tmpfile(self, records):
        """Writes records to a temporary file.

        Args:
            records: A list of records.

        Returns:
            The NamedTemporaryFile object.
        """
        tmpfile = NamedTemporaryFile()
        with open(tmpfile.name, 'w') as filewriter:
            data_csv = convert_to_csv(records)
            filewriter.write(data_csv)
        assert os.path.exists(tmpfile.name)
        return tmpfile
Code example #7
    def write_to_tmpfile(self, records_to_be_written):
        """Writes records in a temperary file.

        Args:
            records_to_be_written: A list of records.

        Returns:
            A file object for that temp file.
        """
        tmpfile = NamedTemporaryFile()
        assert os.path.exists(tmpfile.name)
        with open(tmpfile.name, 'w') as tmpfilewriter:
            data_csv = convert_to_csv(records_to_be_written)
            tmpfilewriter.write(data_csv)
        return tmpfile
Code example #8
import os
import urllib2

import requests

# `sources`, `max_file_size_in_kb` and the helper functions used below are
# project-local in the original source.

def download():
    sourcepath = os.path.join("app", "downloads", "data.gov.in")

    if not os.path.exists(sourcepath):
        os.makedirs(sourcepath)

    # pages 165-414 of the source listing
    for i in range(165, 415):
        print("for page %s" % i)
        response = requests.get(sources[0] + "?page=%s" % i, verify=False)
        page_properties = get_url_title_and_description_from_html(
            response.text)

        for filename, properties in page_properties.iteritems():
            filepath = os.path.join(sourcepath, filename)
            filecontent = ""
            print "downloading " + filename
            if not os.path.exists(filepath):
                try:
                    url = urllib2.urlopen(properties["url"].encode("utf-8"))
                    size = url.headers["Content-Length"]
                    if int(size) < int(max_file_size_in_kb * 1024):
                        r = requests.get(properties["url"])
                        with open(filepath, "w") as datafile:
                            for chunk in r.iter_content(1024):
                                datafile.write(chunk)
                    else:
                        print("[ignored] [too big] %s (%s)" % (filename, size))
                except urllib2.HTTPError as e:
                    print(e)

            if os.path.exists(filepath):
                if filepath.split(".")[-1] == "xls":
                    try:
                        files = convert_to_csv(filepath, sourcepath)

                        # remove original xls file (not needed)
                        os.remove(filepath)

                        # keep as a marker that this file is downloaded
                        os.system("touch %s" % filepath)
                    except Exception as e:
                        files = []
                        print(e)
                else:
                    files = [filepath]

            for fpath in files:
                prepend_property_headers(fpath, properties)
Code example #9
def upload(filename, records):
    """Uploads records to bucket in the given file name and returns
    a bucket object.

    Args:
        filename: A string for name of the file.
        records: A list of records.

    Returns:
        GCS bucket object.
    """
    client = storage.Client()
    bucket = client.bucket(TEST_BUCKET)
    if records is not None:
        blob = bucket.blob(filename)
        blob.upload_from_string(convert_to_csv(records))
    return bucket
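
A possible round-trip check using this helper, assuming TEST_BUCKET exists, credentials for a reasonably recent google-cloud-storage are configured, and convert_to_csv returns a string as in the examples above (the record values are illustrative):

records = [("2020-01-01", "channel_a", 42)]
bucket = upload("slice.csv", records)

# Read the blob back and verify it round-trips through convert_to_csv.
assert bucket.blob("slice.csv").download_as_text() == convert_to_csv(records)
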
Code example #10
    def score(self, predictions, result):
        count = 0
        for i in range(len(predictions)):
            if predictions[i]['Category'] == result[i]:
                count += 1
        return count / len(predictions)

    def define_alpha(self, validation_comments, validation_result):
        """
        Helper function to find a good value for hyper param alpha
        """
        alpha = [0.00001, 0.00005, 0.0001, 0.0005, 0.001, 0.005, 0.01]
        result = np.zeros(len(alpha))
        for i in range(len(alpha)):
            print('Alpha ', i + 1, '/', len(alpha), ' : ', alpha[i])
            predict = bayes_classifier.predict(validation_comments, alpha[i])
            result[i] = bayes_classifier.score(predict, validation_result)
            print(result[i])
        print(result)
        print(alpha[np.argmax(result)])
        return alpha[np.argmax(result)]


if __name__ == "__main__":
    train_data = read_train_data()
    test_data = read_test_data()
    comment = train_data[0]
    result = train_data[1]
    bayes_classifier = BayesClassifier()
    alpha_star = 0.01
    bayes_classifier.train(comment, result)
    predictions = bayes_classifier.predict(test_data, alpha_star)
    convert_to_csv(predictions)
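
The main block hard-codes alpha_star = 0.01, presumably a value found earlier with define_alpha. A sketch of that step, assuming a held-out slice of the training data serves as the validation set (the split and names are illustrative, not from the source):

# Hypothetical validation split for tuning alpha.
split = int(0.9 * len(comment))
bayes_classifier.train(comment[:split], result[:split])
alpha_star = bayes_classifier.define_alpha(comment[split:], result[split:])
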
Code example #11
    def test_convert_to_csv(self, test_records, test_csv_records):
        """Tests convert_to_csv on every prefix of the test records."""
        for length in range(len(test_records)):
            assert convert_to_csv(test_records[:length + 1]) == test_csv_records[length]