from io import BytesIO
from tempfile import mkdtemp
from urllib.request import urlopen
from zipfile import ZipFile


def unzip_to_temp(zipurl, extract=True):
    """
    :param zipurl: url of MapPLUTO zip file
    :param extract: when False, return the archive's file list without extracting
    :return: temporary directory of unzipped shapefiles
    """
    tmp_dir = mkdtemp()
    with urlopen(zipurl) as zipresp:
        print(f"Downloading zip file {zipurl}")
        with ZipFile(BytesIO(zipresp.read())) as zfile:
            if not extract:
                return zfile.namelist()
            print("Unzipping file")
            zfile.extractall(tmp_dir)
    # does the archive contain more zip files?
    child_zips = list_all_files(tmp_dir, ['zip'])
    if len(child_zips) > 0:
        unzip_child(child_zips)
    return tmp_dir
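
# unzip_to_temp relies on list_all_files and unzip_child helpers defined
# elsewhere in the project. A minimal sketch of what list_all_files might look
# like (hypothetical implementation; it assumes the helper walks a directory
# tree and keeps files whose names end with one of the given extensions):
import os

def list_all_files_sketch(directory, extensions=None):
    # collect full paths under `directory`, filtered by extension suffix
    matches = []
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            if extensions is None or any(filename.endswith(ext) for ext in extensions):
                matches.append(os.path.join(root, filename))
    return matches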
import logging
import os

import oss2

import utils

OSS_ENDPOINT = os.getenv('OSS_ENDPOINT')
OSS_AK = os.getenv('OSS_AK')
OSS_SK = os.getenv('OSS_SK')
OSS_BUCKET = os.getenv('OSS_BUCKET')
OSS_REMOTE_ROOT = os.getenv('OSS_REMOTE_ROOT', '')
SRC_ROOT = os.getenv('SRC_ROOT')
SKIP_UPLOADED = os.getenv('SKIP_UPLOADED') == 'true'

assert OSS_ENDPOINT
assert OSS_AK
assert OSS_SK
assert SRC_ROOT

auth = oss2.Auth(OSS_AK, OSS_SK)
bucket = oss2.Bucket(auth, OSS_ENDPOINT, OSS_BUCKET)

if not SRC_ROOT.endswith('/'):
    SRC_ROOT += '/'
SRC_ROOT_LEN = len(SRC_ROOT)

for f in utils.list_all_files(SRC_ROOT):
    # object key is the path relative to SRC_ROOT, optionally nested
    # under OSS_REMOTE_ROOT
    key = f[SRC_ROOT_LEN:]
    if OSS_REMOTE_ROOT:
        key = os.path.join(OSS_REMOTE_ROOT, key)
    logging.debug('start uploading file %s to %s', f, key)
    if SKIP_UPLOADED and bucket.object_exists(key):
        continue
    bucket.put_object_from_file(key, f)
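
# For large files, the put_object_from_file call above can be swapped for
# oss2's built-in resumable upload, which uploads in parts and can resume
# after a failure; a sketch of the alternative loop body:
#
#     if SKIP_UPLOADED and bucket.object_exists(key):
#         continue
#     oss2.resumable_upload(bucket, key, f)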
import openpyxl
from openpyxl import Workbook
from flask import Flask
from flask_pymongo import PyMongo

from constant import REFERENCE_FILE, TEST_FILES, TEST_PDF_PATH
from utils import list_all_files  # assumed: matches the helper module used by the sibling scripts

app = Flask('mongo')
app.config['MONGO_DBNAME'] = 'cpa_database'
app.config['MONGO_URI'] = 'mongodb://localhost:27017/cpa_database'
mongo = PyMongo(app)

# earlier sample directories, kept for reference
# path = "/Users/shravanc/flask/flask_apps/cpa/UserSamples"
# path = "/Users/shravanc/Desktop/cpa_report/good"
path = "/Users/shravanc/Desktop/CPA_files/new_certificates/data"
files = list_all_files(path)
# files = list_all_files(TEST_FILES)

upload_path = TEST_PDF_PATH  # "/Users/shravanc/flask/flask_apps/cpa_certificate_extraction/development/uploads"
print("ALL_FILES====>", files)

wb = Workbook()

"""
def update_excel_sheet(result, name):
    # writes one result dict per sheet, skipping the first four items
    sheet = wb.create_sheet(name)
    for index, (key, value) in enumerate(result.items()):
        if index < 4:
            continue
        sheet.cell(row=1 + index, column=1).value = str(key)
        sheet.cell(row=1 + index, column=2).value = str(value)
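
# A sketch of how the disabled helper above might be driven once re-enabled;
# the extract_certificate step and the output file name are hypothetical, not
# part of this snippet:
#
#     for f in files:
#         result = extract_certificate(f)  # hypothetical extraction step
#         update_excel_sheet(result, os.path.basename(f))
#     wb.save('cpa_report.xlsx')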
import os
import zipfile

import mxnet as mx

from utils import list_all_files  # assumed: matches the helper module used by the sibling scripts

# Retrieve dataset files
url = 'https://github.com/hromi/SMILEsmileD/archive/master.zip'
print("Downloading dataset")
mx.test_utils.download(url)
with zipfile.ZipFile('master.zip', 'r') as zip_ref:
    zip_ref.extractall()
os.remove('master.zip')

# Load training images
negative_paths = list(
    list_all_files('SMILEsmileD-master/SMILEs/negatives/negatives7/', ['.jpg']))
print('loaded', len(negative_paths), 'negative examples')
positive_paths = list(
    list_all_files('SMILEsmileD-master/SMILEs/positives/positives7/', ['.jpg']))
print('loaded', len(positive_paths), 'positive examples')
examples = [(path, 0) for path in negative_paths] + \
           [(path, 1) for path in positive_paths]


# Convert loaded images into numpy arrays
def examples_to_dataset(examples, size=32):
    X = []
    y = []
    for path, label in examples:
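        # The loop body is cut off in the source. A plausible completion (a
        # sketch, not the author's exact code): read each image in grayscale,
        # resize to `size` x `size`, and collect image/label pairs. The imports
        # would normally sit at the top of the file; they are assumed here.
        import numpy as np
        from skimage.io import imread
        from skimage.transform import resize
        img = imread(path, as_gray=True)
        img = resize(img, (size, size))
        X.append(img)
        y.append(label)
    return np.asarray(X), np.asarray(y)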
# This parses poetry from www.poetryfoundation.org into files that have a
# title on the first line, author on the second, and the rest is the poem.
import hashlib

from bs4 import BeautifulSoup as bs

from utils.list_all_files import *

# get_ipython().system('mkdir -p output')


def get_hash(text):
    return hashlib.md5(text.encode('utf8')).hexdigest()


# CSS selectors for the title, author, and poem body
conditions = ['.o-article .c-feature-hd', '.c-txt_attribution a', '.o-poem']

for fn in list_all_files('www.poetryfoundation.org/'):
    with open(fn) as f:
        html = f.read()
    soup = bs(html, 'html.parser')
    results = [soup.select(e) for e in conditions]
    if all(results):
        title = results[0][0].text.strip().split('\n')[0]
        author = results[1][0].text.strip().split('\n')[0]
        poem = results[2][0].get_text('\n').strip().split('\n')
        poem = [e.strip() for e in poem if len(e.strip())]
        poem = '\n'.join(poem)
        output_fn = 'output/' + get_hash(title + author) + '.txt'
        if len(poem) < 100:
            print(f'Parsing error: {fn}')
            continue
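        # the write step is cut off in the source; a plausible continuation
        # (a sketch) writes the title, author, and poem in the layout the
        # header comment describes
        with open(output_fn, 'w') as out:
            out.write(title + '\n' + author + '\n' + poem)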
import os
from shutil import rmtree

import pandas as pd
from google.cloud import storage

# unzip_to_temp, list_all_files, and PlutoYear are assumed to be imported
# from the project's helper modules (the imports are not shown in the source)

client = storage.Client()
bucket = client.bucket('raw-pluto')

# resume from row 16 of the link list
zip_links = pd.read_csv('./etc/zip_links.csv').loc[16:]

for index, row in zip_links.iterrows():
    print(f"make blob for {row.year}.csv")
    blob = bucket.blob(f"{row.year}.csv")
    tmp_dir = unzip_to_temp(row.path)
    shapes = list_all_files(tmp_dir, ['shp'])
    print("make pluto-year objects")
    py = PlutoYear(shapes)
    print("save py obj to tmp_data.csv")
    py.wkt_file.to_csv('tmp_data.csv', index=False)
    print("clean tmp dir")
    rmtree(tmp_dir)
    blob.upload_from_filename('./tmp_data.csv')
    os.remove('tmp_data.csv')

if __name__ == '__main__':
    pass
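
# A variant of the loop body above that guarantees the temp directory is
# cleaned up even when PlutoYear or the CSV write raises (a sketch reusing
# the same helpers):
#
#     tmp_dir = unzip_to_temp(row.path)
#     try:
#         shapes = list_all_files(tmp_dir, ['shp'])
#         PlutoYear(shapes).wkt_file.to_csv('tmp_data.csv', index=False)
#     finally:
#         rmtree(tmp_dir)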
import os
import re

from utils import list_all_files  # assumed: matches the helper module used by the sibling scripts

# (the head of the `mapping` dict is truncated in the source; the visible
# tail normalizes curly quotes)
    '“': '"',
    '”': '"'
}


def remove_special(text):
    return ''.join([mapping[e] if e in mapping else e for e in text])


def strip_word(word):
    word = re.sub(r'^\W*|\W*$', '', word).lower()
    return word


basenames = []
all_poems = {}
total_lines = 0
words = set()

for fn in list_all_files('../../scraping/poetry/output'):
    with open(fn) as f:
        original = f.read()
    text = remove_special(original).split('\n')
    poem = text[3:]
    basename = os.path.basename(fn)
    basename = os.path.splitext(basename)[0]
    basenames.append(basename)
    all_poems[basename] = {
        'url': text[0],
        'title': text[1],
        'author': text[2],
        'poem': poem
    }
    total_lines += len(poem)
    poem = '\n'.join(poem)
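    # the snippet cuts off here; the `words` set initialized above suggests a
    # vocabulary pass like this one (a sketch, not the author's exact code)
    for word in poem.split():
        word = strip_word(word)
        if word:
            words.add(word)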
import glob

import cv2
import imutils
import numpy as np
import keras
from skimage.measure import block_reduce
from skimage.io import imread

from utils import list_all_files

INPUT_IMAGE_SIZE = 64

# The path to the directory where the original
# dataset was uncompressed
original_smiles_dataset_dir = '/Users/sidnpoo/Downloads/MLStuff_DoNotDelete/_DATASETS_/SmileFrownData/SMILEs/positives/positives7'
original_frowns_dataset_dir = '/Users/sidnpoo/Downloads/MLStuff_DoNotDelete/_DATASETS_/SmileFrownData/SMILEs/negatives/negatives7'

# NOTE: the smiles directory feeds the *negative* list here and vice versa,
# which looks inverted relative to the other SMILEs scripts; kept as in the
# source.
negative_paths = list(list_all_files(original_smiles_dataset_dir, ['.jpg']))
print('loaded', len(negative_paths), 'negative examples')
positive_paths = list(list_all_files(original_frowns_dataset_dir, ['.jpg']))
print('loaded', len(positive_paths), 'positive examples')
examples = [(path, 0) for path in negative_paths] + \
           [(path, 1) for path in positive_paths]


def examples_to_dataset(examples, block_size=1):
    X = []
    y = []
    for path, label in examples:
        # `as_grey` was renamed `as_gray` in newer scikit-image releases
        img = imread(path, as_gray=True)
        img = block_reduce(img, block_size=(block_size, block_size), func=np.mean)
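        # continuation sketch for the truncated loop (assumed, not the
        # source's exact code): collect images and labels, then stack them
        X.append(img)
        y.append(label)
    return np.asarray(X, dtype=np.float32), np.asarray(y, dtype=np.int32)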