Example 1
import re

from bs4 import BeautifulSoup


# backup, csv_path, html_path and CsvFile come from the surrounding module.
def populate_csv_file():

    # Keep a copy of the existing CSV before overwriting it.
    backup(csv_path)

    f_csv = CsvFile(csv_path, 'w')

    seen_courses = set()

    with open(html_path) as html_file:
        course_tags_soup = BeautifulSoup(html_file, 'html.parser')

    for course_tag in course_tags_soup.find_all('a'):

        displayed_course_information = course_tag.contents[0]

        department_and_level_regex = r'[A-Zx0-9 .\-]+'

        # The department code may come before or after the title,
        # separated by " - "; otherwise the whole string is the title.
        if re.match(r'%s \- ' % department_and_level_regex, displayed_course_information):
            department_and_level, title = displayed_course_information.split(' - ', 1)
        elif re.search(r' \- %s($|\s)' % department_and_level_regex, displayed_course_information):
            title, department_and_level = displayed_course_information.rsplit(' - ', 1)
        else:
            title, department_and_level = displayed_course_information, ''

        url = course_tag.get('href')

        # Only write each (title, department) pair once.
        if (title, department_and_level) not in seen_courses:
            f_csv.add_row(title, department_and_level, url)

        seen_courses.add((title, department_and_level))

    f_csv.close()
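
None of these examples define the CsvFile class itself. As a minimal sketch of the writer interface Example 1 appears to assume (a thin wrapper over the standard csv module; the real class may well differ), something like this would suffice:

import csv

class CsvFile:
    """Hypothetical minimal stand-in for the CsvFile wrapper used above."""

    def __init__(self, path, mode='r'):
        self._file = open(path, mode, newline='')
        self._writer = csv.writer(self._file)

    def add_row(self, *values):
        # Example 1 passes (title, department_and_level, url).
        self._writer.writerow(values)

    def close(self):
        self._file.close()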
Example 2
    def load(self, filename):
        """Retrieves serialized pandas data from Google Cloud Storage."""
        global profiler
        file_path = self.name() + '/' + filename
        profiler.add_event("  Loading file...")
        gcs_file = CsvFile(file_path)
        profiler.add_event("  Loaded. Reading...")
        self._cache = gcs_file.read()
        profiler.add_event("  load() done")
Example 3
    def save_as_csv(self, url, file_path):
        # NOTE: the url and file_path parameters are unused; every path
        # in this example is hardcoded.
        csv = CsvFile('./data/test.csv')

        # html = './data/view-source_https___www.worldometers.info_coronavirus_.html'
        html = './data/table.html'

        with open(html, "r") as f:
            soup = BeautifulSoup(f, 'html.parser')
        table = soup.find(id="main_table_countries_today")

        # add the header row from the table's <th> cells
        columns = table.find_all('th')
        output_row = [csv.clean(column.text) for column in columns]

        csv.add_columns(output_row)
        #csv.delete_column('A')

        # add one row per <tr>, using the <td> cell text
        for table_row in table.find_all('tr'):
            columns = table_row.find_all('td')
            output_row = [column.text for column in columns]
            csv.add_row(output_row)

        csv.save('./data/abc.csv')
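
For completeness, a hedged sketch of calling this method, assuming it lives on some scraper class (the name Scraper is made up); note that, as written, both arguments are ignored in favor of the hardcoded paths:

scraper = Scraper()
scraper.save_as_csv('https://www.worldometers.info/coronavirus/',
                    './data/abc.csv')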
Example 4
    def save(self, filename):
        """Writes the in-memory dictionary to GCS as a JSON string."""
        global profiler
        file_path = self.name() + '/' + filename
        profiler.add_event("  Opening file...")
        gcs_file = CsvFile(file_path)
        profiler.add_event("  Opened. Writing...")
        gcs_file.write(self._cache)
        # Drop the cache and the file handle once the write completes.
        self._cache = []
        gcs_file = None
        profiler.add_event("  save() done.")
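
Examples 2 and 4 form a load/save pair around the same cache attribute. A minimal sketch of the round trip, assuming a hypothetical GcsStore class that defines name(), _cache and the two methods above:

store = GcsStore()                  # hypothetical class owning _cache
store._cache = {'rows': [1, 2, 3]}
store.save('snapshot.json')         # write the cache out to GCS

store.load('snapshot.json')         # read it back into store._cache
print(store._cache)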
Example 5
    def clean_gcs_bucket(self, bucket):
        """Deletes all files in a given GCS bucket. Used for emptying out
        cluttered buckets, like our backup buckets."""

        bucket_files = CsvFile.list_bucket_files(bucket)
        messages = {}

        for filename in bucket_files:
            msg = CsvFile.delete(filename)
            messages[filename] = msg

        return messages
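
Because the returned dictionary maps each filename to the message from CsvFile.delete, the caller can inspect or log per-file results. A short usage sketch (the instance and bucket name are made up):

messages = maintenance.clean_gcs_bucket('my-backup-bucket')
for filename, msg in messages.items():
    print(filename, '->', msg)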
Example 6
from itertools import chain, islice

import sqlalchemy as sa

# CsvFile, MultipleSaveSet and ChunkStatus come from the surrounding project.


class FlatFileSaveSet(object):

    def __init__(self, database, path=None,
                 table=None, buffer=None,
                 lines_per_chunk=10_000_000_000):

        self.database = database
        self.table = table

        self.file = None
        self.string = None

        self.csv_file = CsvFile(path=path,
                                buffer=buffer)

        # Sniff the dialect, read and parse the header row, and guess
        # column types before splitting the file into chunks.
        self.csv_file.get_dialect()
        self.csv_file.get_headings()
        self.csv_file.parse_headings()
        self.csv_file.guess_types()
        self.lines_per_chunk = lines_per_chunk
        self.lines = self.csv_file.chunk(lines_per_chunk)
        self.chunks = self.csv_file.chunks
        self.chunk_status = []

    def get_first_generator(self, chunk):
        # Find the first row of the chunk that is not a continuation
        # ("prev") of a row from the previous chunk.
        save_data = self.csv_file.iterate_csv(chunk, as_dict=True)
        for num, line in enumerate(save_data):
            if "prev" not in (line.get("_core_id"), line.get("id")):
                break
        else:
            # Every row continued an earlier one; nothing to load here.
            return ()

        # Signal the generator so it can clean up, then exhaust it.
        try:
            save_data.send(1)
            next(save_data)
        except StopIteration:
            pass

        generator = self.csv_file.iterate_csv(chunk, as_dict=True)

        return (num, islice(generator, num, None))


    def get_end_generator(self, chunk):
        # Collect the continuation rows that spill into the next chunk.
        if chunk + 1 not in self.csv_file.chunks:
            return []
        save_data = self.csv_file.iterate_csv(chunk + 1, as_dict=True,
                                              no_end=True)

        num = 0
        for num, line in enumerate(save_data):
            if "prev" not in (line.get("_core_id"), line.get("id")):
                break
        else:
            num = num + 1

        # Signal the generator so it can clean up, then exhaust it.
        try:
            save_data.send(1)
            next(save_data)
        except StopIteration:
            pass

        generator = self.csv_file.iterate_csv(chunk + 1, as_dict=True,
                                              no_end=True)

        return (num, islice(generator, 0, num))

    def load_chunk(self, chunk, validate=True):

        try:
            start, first_generator = self.get_first_generator(chunk)
        except ValueError:
            return ChunkStatus((0, 0), "empty chunk")

        range_start = chunk * self.lines_per_chunk + start + 1

        try:
            end, end_generator = self.get_end_generator(chunk)
            range_end = (chunk + 1) * self.lines_per_chunk + end
        except ValueError:
            range_end, end_generator = self.lines, []

        # Stitch the usable tail of this chunk onto the spill-over
        # rows from the next one.
        save_data = chain(first_generator, end_generator)

        save_set = MultipleSaveSet(self.database, save_data,
                                   table=self.table)
        row_range = (range_start, range_end)

        try:
            save_set_errors = save_set.save(validate=validate)
        except sa.orm.exc.ConcurrentModificationError as e:
            return ChunkStatus(row_range, "locking error", error=e)
        except Exception as e:
            return ChunkStatus(row_range, "unknown error", error=e)
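
A hedged sketch of driving the loader above, assuming that chunks iterates over chunk numbers and that database is an existing handle (both assumptions; neither is defined in the snippet):

save_set = FlatFileSaveSet(database, path='/tmp/courses.csv',
                           table='courses', lines_per_chunk=100_000)

# Load each chunk and keep its ChunkStatus for later reporting.
for chunk in save_set.chunks:
    save_set.chunk_status.append(save_set.load_chunk(chunk))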