import re

from bs4 import BeautifulSoup


def populate_csv_file():
    # backup(), CsvFile, csv_path, and html_path are defined elsewhere in this codebase.
    backup(csv_path)
    f_csv = CsvFile(csv_path, 'w')
    seen_courses = set()
    with open(html_path) as f_html:
        course_tags_soup = BeautifulSoup(f_html, 'html.parser')
    for course_tag in course_tags_soup.find_all('a'):
        displayed_course_information = course_tag.contents[0]
        department_and_level_regex = r'[A-Zx0-9 \.\-]+'
        if re.match(r'%s \- ' % department_and_level_regex, displayed_course_information):
            # "DEPT 101 - Title"
            department_and_level, title = displayed_course_information.split(' - ', 1)
        elif re.search(r' \- %s($|\s)' % department_and_level_regex, displayed_course_information):
            # "Title - DEPT 101"
            title, department_and_level = displayed_course_information.rsplit(' - ', 1)
        else:
            title, department_and_level = displayed_course_information, ''
        url = course_tag.get('href')
        # De-duplicate on (title, department/level) before writing a row.
        if (title, department_and_level) not in seen_courses:
            f_csv.add_row(title, department_and_level, url)
            seen_courses.add((title, department_and_level))
    f_csv.close()
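# --- Hedged example (not part of the original code) ---------------------------
# A minimal, self-contained sketch of the "DEPT 101 - Title" / "Title - DEPT 101"
# splitting logic used in populate_csv_file() above. The sample strings are
# made up for illustration only.
import re

DEPT_REGEX = r'[A-Zx0-9 \.\-]+'

def split_course(text):
    if re.match(r'%s \- ' % DEPT_REGEX, text):
        dept, title = text.split(' - ', 1)
    elif re.search(r' \- %s($|\s)' % DEPT_REGEX, text):
        title, dept = text.rsplit(' - ', 1)
    else:
        title, dept = text, ''
    return title, dept

for sample in ('CS 101 - Intro to Programming',
               'Intro to Programming - CS 101',
               'Programming Workshop'):
    print(split_course(sample))
# -> ('Intro to Programming', 'CS 101') for the first two, ('Programming Workshop', '') for the third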
def load(self, filename):
    """Retrieves serialized pandas data from Google Cloud Storage."""
    global profiler
    file_path = self.name() + '/' + filename
    profiler.add_event(" Loading file...")
    gcs_file = CsvFile(file_path)
    profiler.add_event(" Loaded. Reading...")
    self._cache = gcs_file.read()
    profiler.add_event(" load() done")
from bs4 import BeautifulSoup


def save_as_csv(self, url, file_path):
    # NOTE: the url and file_path parameters are unused as written; the input
    # and output paths are hard-coded below.
    csv = CsvFile('./data/test.csv')
    # html = './data/view-source_https___www.worldometers.info_coronavirus_.html'  # superseded by the line below
    html = './data/table.html'
    with open(html, 'r') as f:
        soup = BeautifulSoup(f, 'html.parser')
    table = soup.find(id="main_table_countries_today")

    # Add the header from the <th> cells.
    output_row = []
    for column in table.find_all('th'):
        output_row.append(csv.clean(column.text))
    csv.add_columns(output_row)
    # csv.delete_column('A')

    # Add one CSV row per <tr>, reading its <td> cells.
    for table_row in table.find_all('tr'):
        output_row = [column.text for column in table_row.find_all('td')]
        if not output_row:
            continue  # header rows contain <th>, not <td>; skip them
        csv.add_row(output_row)
    csv.save('./data/abc.csv')
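# --- Hedged example (not part of the original code) ---------------------------
# A self-contained sketch of the same scrape-table-to-CSV pattern as
# save_as_csv() above, using the standard-library csv module instead of the
# project's CsvFile class. The inline HTML is made up for illustration.
import csv
import io

from bs4 import BeautifulSoup

HTML = """
<table id="demo">
  <tr><th>Country</th><th>Cases</th></tr>
  <tr><td>A</td><td>1</td></tr>
  <tr><td>B</td><td>2</td></tr>
</table>
"""

soup = BeautifulSoup(HTML, 'html.parser')
table = soup.find(id="demo")
buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow([th.text for th in table.find_all('th')])
for tr in table.find_all('tr'):
    cells = [td.text for td in tr.find_all('td')]
    if cells:  # header row has <th>, not <td>
        writer.writerow(cells)
print(buf.getvalue())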
def save(self, filename):
    """Writes the in-memory dictionary to GCS as a JSON string."""
    global profiler
    file_path = self.name() + '/' + filename
    profiler.add_event(" Loading file...")
    gcs_file = CsvFile(file_path)
    profiler.add_event(" Loaded. Writing...")
    gcs_file.write(self._cache)
    self._cache = []
    gcs_file = None
    profiler.add_event(" save() done.")
def clean_gcs_bucket(self, bucket):
    """Deletes all files in a given GCS bucket.

    Used for emptying out cluttered buckets, like our backup buckets.
    """
    bucket_files = CsvFile.list_bucket_files(bucket)
    messages = {}
    for filename in bucket_files:
        msg = CsvFile.delete(filename)
        messages[filename] = msg
    return messages
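# --- Hedged sketch (not part of the original code) -----------------------------
# None of the snippets in this file define CsvFile. Inferring only from the
# calls made above and below, its surface looks roughly like this stub; every
# signature here is an assumption and may differ from the real class.
class CsvFile:
    def __init__(self, path=None, mode='r', buffer=None):
        """Open a CSV file by path (local or GCS) or from an in-memory buffer."""

    def read(self): ...               # load(): return the file's contents
    def write(self, data): ...        # save(): serialize and upload data
    def clean(self, text): ...        # save_as_csv(): sanitize a cell value
    def add_columns(self, row): ...   # write the header row
    def add_row(self, *cells): ...    # called both as add_row(a, b, c) and add_row([a, b, c]) above
    def save(self, path): ...         # flush accumulated rows to path
    def close(self): ...

    # Parsing/chunking helpers used by FlatFileSaveSet below; the class also
    # exposes a .chunks attribute listing chunk indexes.
    def get_dialect(self): ...
    def get_headings(self): ...
    def parse_headings(self): ...
    def guess_types(self): ...
    def chunk(self, lines_per_chunk): ...  # returns the total line count
    def iterate_csv(self, chunk, as_dict=False, no_end=False): ...

    @staticmethod
    def list_bucket_files(bucket): ...     # clean_gcs_bucket(): list a bucket's files

    @staticmethod
    def delete(filename): ...              # clean_gcs_bucket(): delete one file, return a status message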
from itertools import chain, islice

import sqlalchemy as sa


class FlatFileSaveSet(object):

    def __init__(self, database, path=None, table=None, buffer=None,
                 lines_per_chunk=10000000000):
        self.database = database
        self.table = table
        self.file = None
        self.string = None  # fixed from the original typo "strinhg"
        self.csv_file = CsvFile(path=path, buffer=buffer)
        self.csv_file.get_dialect()
        self.csv_file.get_headings()
        self.csv_file.parse_headings()
        self.csv_file.guess_types()
        self.lines_per_chunk = lines_per_chunk
        self.lines = self.csv_file.chunk(lines_per_chunk)
        self.chunks = self.csv_file.chunks
        self.chunk_status = []

    def get_first_generator(self, chunk):
        # Skip this chunk's leading "prev" rows: they continue a record begun
        # in the previous chunk and are saved with that chunk instead.
        save_data = self.csv_file.iterate_csv(chunk, as_dict=True)
        for num, line in enumerate(save_data):
            if "prev" not in (line.get("_core_id"), line.get("id")):
                break
        else:
            return ()  # every row continues the previous chunk
        try:
            # Kept from the original: iterate_csv is evidently a coroutine-style
            # generator that accepts send() before being discarded.
            save_data.send(1)
            next(save_data)
        except StopIteration:
            pass
        generator = self.csv_file.iterate_csv(chunk, as_dict=True)
        return (num, islice(generator, num, None))

    def get_end_generator(self, chunk):
        # Claim the leading "prev" rows of the next chunk, which belong to the
        # last record begun in this chunk.
        if chunk + 1 not in self.csv_file.chunks:
            return []  # no next chunk; unpacking this raises ValueError in load_chunk()
        save_data = self.csv_file.iterate_csv(chunk + 1, as_dict=True, no_end=True)
        num = 0
        for num, line in enumerate(save_data):
            if "prev" not in (line.get("_core_id"), line.get("id")):
                break
        else:
            num = num + 1  # every row continues this chunk; claim them all
        try:
            save_data.send(1)
            next(save_data)
        except StopIteration:
            pass
        generator = self.csv_file.iterate_csv(chunk + 1, as_dict=True, no_end=True)
        return (num, islice(generator, 0, num))

    def load_chunk(self, chunk, validate=True):
        try:
            start, first_generator = self.get_first_generator(chunk)
        except ValueError:
            # get_first_generator() returned (), which fails to unpack.
            return ChunkStatus((0, 0), "empty chunk")
        range_start = chunk * self.lines_per_chunk + start + 1
        try:
            end, end_generator = self.get_end_generator(chunk)
            range_end = (chunk + 1) * self.lines_per_chunk + end
        except ValueError:
            range_end, end_generator = self.lines, []
        save_data = chain(first_generator, end_generator)
        save_set = MultipleSaveSet(self.database, save_data, table=self.table)
        range = (range_start, range_end)  # shadows the builtin; kept from the original
        try:
            save_set_errors = save_set.save(validate=validate)
        except sa.orm.exc.ConcurrentModificationError as e:
            return ChunkStatus(range, "locking error", error=e)
        except Exception as e:
            raise
            # NOTE: unreachable after the bare raise above; kept from the original.
            return ChunkStatus(range, "unknown error", error=e)
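# --- Hedged example (not part of the original code) ----------------------------
# A minimal, self-contained sketch of the chunk-boundary technique that
# get_first_generator()/get_end_generator() implement above: rows whose id is
# "prev" continue a record from the previous chunk, so each chunk skips its own
# leading "prev" rows and instead claims the leading "prev" rows of the next
# chunk. The in-memory chunks below are made up; the real code works on
# CsvFile.iterate_csv() generators and also checks the "_core_id" field.
from itertools import chain, islice

chunks = [
    [{"id": "a"}, {"id": "b"}, {"id": "prev"}],  # chunk 0: ends mid-record
    [{"id": "prev"}, {"id": "c"}, {"id": "d"}],  # chunk 1: first row continues "b"
]

def rows_for_chunk(n):
    # Skip leading "prev" rows in this chunk...
    start = 0
    for start, row in enumerate(chunks[n]):
        if row["id"] != "prev":
            break
    body = islice(chunks[n], start, None)
    # ...and claim the leading "prev" rows of the next chunk, if any.
    tail = []
    if n + 1 < len(chunks):
        end = 0
        for end, row in enumerate(chunks[n + 1]):
            if row["id"] != "prev":
                break
        tail = islice(chunks[n + 1], 0, end)
    return chain(body, tail)

print(list(rows_for_chunk(0)))  # a, b, prev, prev -- "b" keeps both of its continuation rows
print(list(rows_for_chunk(1)))  # c, d -- the spilled "prev" row was handed to chunk 0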