Example #1
from io import BytesIO
from tempfile import mkdtemp
from urllib.request import urlopen
from zipfile import ZipFile


def unzip_to_temp(zipurl, extract=True):
    """Download a MapPluto zip archive and unpack it into a temporary directory.

    :param zipurl: URL of the MapPluto zip file
    :param extract: if False, return the archive's file listing instead of extracting
    :return: temporary directory containing the unzipped shapefiles
    """

    tmp_dir = mkdtemp()

    with urlopen(zipurl) as zipresp:
        print(f"Downloading ZIPFile {zipurl}")
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            if not extract:
                # return the archive's contents without unpacking
                return zfile.namelist()
            print("Unzipping file")
            zfile.extractall(tmp_dir)

    # does the extracted archive contain nested zip files?
    child_zips = list_all_files(tmp_dir, ['zip'])
    if child_zips:
        unzip_child(child_zips)

    return tmp_dir
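
Every example on this page relies on a list_all_files helper. A minimal sketch of what such a helper typically looks like, assuming it walks a directory tree and yields paths filtered by extension (the real project utility may differ, e.g. in whether extensions carry a leading dot):

import os

def list_all_files(directory, extensions=None):
    """Yield the path of every file under `directory`, optionally keeping
    only files whose extension appears in `extensions` (sketch only)."""
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            _, ext = os.path.splitext(filename)
            if extensions is None or ext.lower() in extensions:
                yield os.path.join(root, filename)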
Example #2
import logging
import os

import oss2

import utils


OSS_ENDPOINT = os.getenv('OSS_ENDPOINT')
OSS_AK = os.getenv('OSS_AK')
OSS_SK = os.getenv('OSS_SK')
OSS_BUCKET = os.getenv('OSS_BUCKET')
OSS_REMOTE_ROOT = os.getenv('OSS_REMOTE_ROOT', '')
SRC_ROOT = os.getenv('SRC_ROOT')
SKIP_UPLOADED = os.getenv('SKIP_UPLOADED') == 'true'
assert OSS_ENDPOINT
assert OSS_AK
assert OSS_SK
assert SRC_ROOT

auth = oss2.Auth(OSS_AK, OSS_SK)
bucket = oss2.Bucket(auth, OSS_ENDPOINT, OSS_BUCKET)

if not SRC_ROOT.endswith('/'):
    SRC_ROOT += '/'

SRC_ROOT_LEN = len(SRC_ROOT)
for f in utils.list_all_files(SRC_ROOT):
    key = f[SRC_ROOT_LEN:]
    if OSS_REMOTE_ROOT:
        key = os.path.join(OSS_REMOTE_ROOT, key)
    logging.debug('start uploading file %s to %s', f, key)
    if SKIP_UPLOADED and bucket.object_exists(key):
        continue  # skip objects already present in the bucket
    bucket.put_object_from_file(key, f)
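
The script is configured entirely through environment variables. A hypothetical setup for a local test, with every value a placeholder (it would have to run before the module-level assertions above):

import os

os.environ.update({
    'OSS_ENDPOINT': 'https://oss-cn-hangzhou.aliyuncs.com',  # placeholder endpoint
    'OSS_AK': 'your-access-key-id',                          # placeholder credential
    'OSS_SK': 'your-access-key-secret',                      # placeholder credential
    'OSS_BUCKET': 'my-bucket',
    'SRC_ROOT': '/data/to/upload',
    'SKIP_UPLOADED': 'true',
})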
Example #3
from flask import Flask
from flask_pymongo import PyMongo
from openpyxl import Workbook

from constant import REFERENCE_FILE, TEST_FILES, TEST_PDF_PATH
from utils import list_all_files  # assumed: the same project helper used in the other examples

app = Flask('mongo')
app.config['MONGO_DBNAME'] = 'cpa_database'
app.config['MONGO_URI'] = 'mongodb://localhost:27017/cpa_database'

mongo = PyMongo(app)

#path = "/Users/shravanc/flask/flask_apps/cpa/UserSamples"

path = "/Users/shravanc/Desktop/cpa_report/good"
path = "/Users/shravanc/Desktop/CPA_files/new_certificates/data"
files = list_all_files(path)
#files = list_all_files(TEST_FILES)
upload_path = TEST_PDF_PATH  #"/Users/shravanc/flask/flask_apps/cpa_certificate_extraction/development/uploads"
print("ALL_FILES====>", files)

wb = Workbook()
"""
def update_excel_sheet(result, name):
		sheet = wb.create_sheet(name)

		for index, (key, value) in enumerate(result.items()):
				if index < 4:
						continue
				sheet.cell(row=1+index, column=1).value = str(key)
				sheet.cell(row=1+index, column=2).value = str(value)
Example #4
import os
import zipfile

import mxnet as mx

from utils import list_all_files  # assumed project helper

# Retrieve dataset files
url = 'https://github.com/hromi/SMILEsmileD/archive/master.zip'

print("Downloading dataset")
mx.test_utils.download(url)

with zipfile.ZipFile('master.zip', 'r') as zip_ref:
    zip_ref.extractall()

os.remove('master.zip')

# Load training images
negative_paths = list(
    list_all_files('SMILEsmileD-master/SMILEs/negatives/negatives7/',
                   ['.jpg']))
print('loaded', len(negative_paths), 'negative examples')
positive_paths = list(
    list_all_files('SMILEsmileD-master/SMILEs/positives/positives7/',
                   ['.jpg']))
print('loaded', len(positive_paths), 'positive examples')
examples = [(path, 0)
            for path in negative_paths] + [(path, 1)
                                           for path in positive_paths]


# Convert loaded images into numpy arrays
def examples_to_dataset(examples, size=32):
    X = []
    y = []
    for path, label in examples:
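
The listing truncates mid-function. A minimal sketch of a completion, assuming scikit-image handles the loading and resizing; the imports, the grayscale conversion, and the size x size resize are assumptions, not the original author's code:

import numpy as np
from skimage.io import imread
from skimage.transform import resize

def examples_to_dataset(examples, size=32):
    X = []
    y = []
    for path, label in examples:
        img = imread(path, as_gray=True)   # load one image as grayscale
        img = resize(img, (size, size))    # normalize every image to size x size
        X.append(img)
        y.append(label)
    return np.asarray(X), np.asarray(y)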
Example #5
# Parses poetry pages from www.poetryfoundation.org into text files with the
# title on the first line, the author on the second, and the poem on the rest.

import hashlib
import os

from bs4 import BeautifulSoup as bs
from utils.list_all_files import *

os.makedirs('output', exist_ok=True)  # ensure the output directory exists


def get_hash(text):
    return hashlib.md5(text.encode('utf8')).hexdigest()


conditions = ['.o-article .c-feature-hd', '.c-txt_attribution a', '.o-poem']
for fn in list_all_files('www.poetryfoundation.org/'):
    with open(fn) as f:
        html = f.read()
        soup = bs(html, 'html.parser')
        results = [soup.select(e) for e in conditions]
        if all(results):
            title = results[0][0].text.strip().split('\n')[0]
            author = results[1][0].text.strip().split('\n')[0]
            poem = results[2][0].get_text('\n').strip().split('\n')
            poem = [e.strip() for e in poem if len(e.strip())]
            poem = '\n'.join(poem)
            output_fn = 'output/' + get_hash(title + author) + '.txt'

            if len(poem) < 100:
                print(f'Parsing error: {fn}')
                continue
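
The example ends before the parsed poem is written out. A plausible final step, assuming the format described in the header comment (hypothetical, not the original code):

            with open(output_fn, 'w') as out:
                out.write(title + '\n')
                out.write(author + '\n')
                out.write(poem + '\n')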
Example #6
import os
from shutil import rmtree

import pandas as pd
from google.cloud import storage

# unzip_to_temp (see Example #1), list_all_files, and PlutoYear are project-local helpers

client = storage.Client()
bucket = client.bucket('raw-pluto')

zip_links = pd.read_csv('./etc/zip_links.csv').loc[16:]

for index, row in zip_links.iterrows():

    print(f"make blob for {row.year}.csv")
    blob = bucket.blob(f"{row.year}.csv")
    dir = unzip_to_temp(row.path)
    shapes = list_all_files(dir, ['shp'])
    print("make pluto-year objects")
    py = PlutoYear(shapes)
    print("save py obj to tmp_data.csv")
    py.wkt_file.to_csv('tmp_data.csv', index=False)
    print("clean tmp dir")
    rmtree(dir)
    blob.upload_from_filename('./tmp_data.csv')
    os.remove('tmp_data.csv')


if __name__ == '__main__':
    pass


Example #7
import os
import re

from utils import list_all_files  # assumed project helper

# tail of a character-normalization table; earlier entries are cut off in the
# listing and are left elided here
mapping = {
 # ... earlier entries elided ...
 '“': '"',
 '”': '"'
}

def remove_special(text):
    return ''.join([mapping[e] if e in mapping else e for e in text])

def strip_word(word):
    word = re.sub(r'^\W*|\W*$', '', word).lower()
    return word

basenames = []
all_poems = {}
total_lines = 0
words = set()
for fn in list_all_files('../../scraping/poetry/output'):
    with open(fn) as f:
        original = f.read()
        text = remove_special(original).split('\n')
        poem = text[3:]
        basename = os.path.basename(fn)
        basename = os.path.splitext(basename)[0]
        basenames.append(basename)
        all_poems[basename] = {
            'url': text[0],
            'title': text[1],
            'author': text[2],
            'poem': poem
        }
        total_lines += len(poem)
        poem = '\n'.join(poem)
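
The listing truncates here; the words set is initialized above but never filled in the visible lines. A plausible continuation, assuming it accumulates the normalized vocabulary (hypothetical, not the original code):

        for word in poem.split():
            word = strip_word(word)
            if word:
                words.add(word)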
Example #8
import glob
import cv2
import imutils
import numpy as np
import keras

INPUT_IMAGE_SIZE = 64
# The path to the directory where the original
# dataset was uncompressed
original_smiles_dataset_dir = '/Users/sidnpoo/Downloads/MLStuff_DoNotDelete/_DATASETS_/SmileFrownData/SMILEs/positives/positives7'
original_frowns_dataset_dir = '/Users/sidnpoo/Downloads/MLStuff_DoNotDelete/_DATASETS_/SmileFrownData/SMILEs/negatives/negatives7'



from utils import list_all_files
# frowns form the negative class, smiles the positive class
negative_paths = list(list_all_files(original_frowns_dataset_dir, ['.jpg']))
print('loaded', len(negative_paths), 'negative examples')
positive_paths = list(list_all_files(original_smiles_dataset_dir, ['.jpg']))
print('loaded', len(positive_paths), 'positive examples')
examples = [(path, 0) for path in negative_paths] + [(path, 1) for path in positive_paths]

import numpy as np
from skimage.measure import block_reduce
from skimage.io import imread

def examples_to_dataset(examples, block_size=1):
    X = []
    y = []
    for path, label in examples:
        img = imread(path, as_gray=True)
        img = block_reduce(img, block_size=(block_size, block_size), func=np.mean)
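
This example truncates mid-loop as well. A minimal completion consistent with the surrounding code (the return values are an assumption, not the original):

        X.append(img)
        y.append(label)
    return np.asarray(X), np.asarray(y)

# hypothetical usage:
# X, y = examples_to_dataset(examples, block_size=2)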