Ejemplo n.º 1
0
def make_blocks(num_records=2000, codec='null'):
    """Round-trip generated records through an in-memory Avro file.

    The records produced by ``make_records(num_records)`` are written with
    ``fastavro.writer`` into a ``BytesIO`` buffer using the module-level
    ``schema``, then read back block by block with ``fastavro.block_reader``.

    :param num_records: value forwarded to ``make_records``
    :param codec: codec name forwarded to ``fastavro.writer``
    :return: tuple ``(blocks, records)`` -- the list of blocks read back and
        the records that were written
    """
    records = make_records(num_records)

    buffer = BytesIO()
    fastavro.writer(buffer, schema, records, codec=codec)

    # Rewind so the reader starts at the beginning of the written data.
    buffer.seek(0)
    reader = fastavro.block_reader(buffer, schema)

    # Materialize every block before the buffer is closed.
    blocks = [block for block in reader]

    buffer.close()

    return blocks, records
Ejemplo n.º 2
0
def avro_to_pandas(fname, reader_schema=None, num_cores=None):
    """Load an Avro file into a pandas dataframe, processing blocks in parallel.

    Each block yielded by ``block_reader`` is handed to ``process_block`` as a
    separate job; the per-block results are then flattened into one list and
    wrapped in a ``DataFrame``.

    :param fname: path of the Avro file to be converted
    :param reader_schema: schema passed to ``block_reader`` (optional)
    :param num_cores: number of parallel jobs (optional); ``cpu_count()`` is
        used when this is ``None``
    :return: Pandas dataframe
    """
    n_jobs = cpu_count() if num_cores is None else num_cores

    with open(fname, 'rb') as avro_file:
        blocks = block_reader(avro_file, reader_schema)
        # The jobs are dispatched inside the ``with`` so the blocks are read
        # while the file is still open.
        jobs = (delayed(process_block)(block) for block in blocks)
        per_block = Parallel(n_jobs=n_jobs)(jobs)

    rows = list(itertools.chain.from_iterable(per_block))
    return DataFrame(rows)
Ejemplo n.º 3
0
def make_blocks(num_records=2000, codec='null'):
    """Write generated records to a ``MemoryIO`` buffer and read them back as blocks.

    The records come from ``make_records(num_records)`` and are serialized
    with ``fastavro.writer`` using the module-level ``schema``; the same
    buffer is then rewound and consumed with ``fastavro.block_reader``.

    :param num_records: value forwarded to ``make_records``
    :param codec: codec name forwarded to ``fastavro.writer``
    :return: tuple ``(blocks, records)`` -- the list of blocks read back and
        the records that were written
    """
    records = make_records(num_records)

    stream = MemoryIO()
    fastavro.writer(stream, schema, records, codec=codec)

    # Back to the start of the stream before reading.
    stream.seek(0)
    reader = fastavro.block_reader(stream, schema)

    # Collect all blocks while the stream is still open.
    blocks = [block for block in reader]

    stream.close()

    return blocks, records
Ejemplo n.º 4
0
 def __init__(self, datafile, datadir='', n_records=None, n_jobs=1):
     """Open the data file and build the search index over it.

     The file handle stays open after construction because ``self.reader``
     keeps reading from it. If creating the reader or building the index
     fails, the handle is closed before the exception propagates.

     :param datafile: name of the input file, joined onto ``datadir``
     :param datadir: directory containing ``datafile`` (default: ``''``)
     :param n_records: forwarded unchanged to ``_build_``
     :param n_jobs: forwarded unchanged to ``_build_``
     """
     self.datadir = datadir
     self.datafile = datafile
     self.fo = open(os.path.join(self.datadir, self.datafile), "rb")
     try:
         self.reader = block_reader(self.fo)
         # Dictionary mapping categories to ordered lists of ids of all items in the category
         self.itemsByCategory = {}
         # Dictionary mapping item ids to their position in the input file
         self.itemPositions = {}
         self.logger = logging.getLogger(
             'similar_item_service.search.SearchIndex')
         self._build_(n_records=n_records, n_jobs=n_jobs)
     except BaseException:
         # Construction failed: nobody will hold this object, so release the
         # file handle instead of leaking it, then let the error propagate.
         self.fo.close()
         raise
     # Keep track of current block and index in datafile to avoid unnecessary reseeking/iteration
     self.curBlock = None
     self.curIdx = None
Ejemplo n.º 5
0
def make_blocks(num_records=2000, codec='null', write_to_disk=False):
    """Write generated records to an Avro file and read them back as blocks.

    The records produced by ``make_records(num_records)`` are written with
    ``fastavro.writer`` using the module-level ``schema``, either to a
    ``NamedTemporaryFile`` or to a ``MemoryIO`` buffer, and then read back
    with ``fastavro.block_reader``.

    :param num_records: value forwarded to ``make_records``
    :param codec: codec name forwarded to ``fastavro.writer``
    :param write_to_disk: use a ``NamedTemporaryFile`` when true, otherwise a
        ``MemoryIO`` buffer
    :return: tuple ``(blocks, records, num_bytes)`` -- the list of blocks read
        back, the records that were written, and the stream position reported
        by ``tell()`` right after writing
    """
    records = make_records(num_records)

    new_file = NamedTemporaryFile() if write_to_disk else MemoryIO()
    try:
        fastavro.writer(new_file, schema, records, codec=codec)
        # Position right after the write; named ``num_bytes`` so the builtin
        # ``bytes`` is not shadowed.
        num_bytes = new_file.tell()

        new_file.seek(0)
        # Materialize every block before the file is closed.
        blocks = list(fastavro.block_reader(new_file, schema))
    finally:
        # Always close, even when writing or reading raises, so a temporary
        # file on disk is not left open behind a failed call.
        new_file.close()

    return blocks, records, num_bytes
Ejemplo n.º 6
0
def make_blocks(num_records=2000, codec='null', write_to_disk=False):
    """Round-trip generated records through an Avro file, on disk or in memory.

    ``make_records(num_records)`` supplies the records, ``fastavro.writer``
    serializes them with the module-level ``schema``, and
    ``fastavro.block_reader`` reads the result back block by block.

    :param num_records: value forwarded to ``make_records``
    :param codec: codec name forwarded to ``fastavro.writer``
    :param write_to_disk: when true the data goes to a ``NamedTemporaryFile``;
        otherwise to a ``MemoryIO`` buffer
    :return: tuple ``(blocks, records, num_bytes)`` -- the list of blocks read
        back, the records that were written, and the stream position reported
        by ``tell()`` right after writing
    """
    records = make_records(num_records)

    new_file = NamedTemporaryFile() if write_to_disk else MemoryIO()
    try:
        fastavro.writer(new_file, schema, records, codec=codec)
        # Captured before rewinding; the local is not called ``bytes`` so the
        # builtin stays usable inside this function.
        num_bytes = new_file.tell()

        new_file.seek(0)
        # The blocks must be collected while the file is still open.
        blocks = list(fastavro.block_reader(new_file, schema))
    finally:
        # Close on every path so a failure in the writer or reader does not
        # leave the temporary file open.
        new_file.close()

    return blocks, records, num_bytes
Ejemplo n.º 7
0
 def _get_(self, blockStart, index):
     '''
     Return the item at position index within the block found at file
     offset blockStart.

     The most recently loaded block is cached in self.curBlock as a
     (blockStart, block) pair and self.curIdx counts how many items of it
     have been consumed so far, so a request further along the same block
     does not reseek the file.

     Returns None implicitly if the block iteration ends before the
     requested position is reached.
     '''
     # Load (or reload) the block when nothing is cached, when a different
     # block is requested, or when the cursor has already moved past index.
     if self.curBlock is None or self.curBlock[
             0] != blockStart or self.curIdx > index:
         self.fo.seek(blockStart)
         try:
             self.curBlock = (blockStart, next(self.reader))
         except StopIteration:
             # The reader reported exhaustion: rebuild it from the start of
             # the file, then seek to the requested block and retry once.
             # NOTE(review): presumably block_reader has to consume the file
             # header at offset 0 before it can be repositioned -- confirm.
             self.fo.seek(0)
             self.reader = block_reader(self.fo)
             self.fo.seek(blockStart)
             self.curBlock = (blockStart, next(self.reader))
         self.curIdx = 0
     # NOTE(review): when the cached block is reused, this relies on iterating
     # the block object again resuming after the items already consumed
     # rather than restarting -- confirm against block_reader's block type.
     for item in self.curBlock[1]:
         self.curIdx += 1
         # curIdx is now the 1-based count of consumed items, so the item
         # just read sits at 0-based position curIdx - 1.
         if self.curIdx > index:
             return item
Ejemplo n.º 8
0
def read_avro_blocks(path, logger=None):
    """
    Lazily iterates over the blocks of the avro file in argument.
    If the path does not exist, this is reported through logger.error (or
    print when no logger is given); the file is still opened afterwards, so
    the error raised by open() propagates to the caller.
    Nothing happens until the returned generator is first advanced.
    @param path: full path of the avro file to read
    @param logger: optional object whose error() method receives the message
    @return: avro blocks iterator
    """
    if not os.path.exists(path):
        # Choose the reporting channel once, then emit the message.
        report = logger.error if logger else print
        report(f"No file found: {path}")

    with open(path, "rb") as avro_file:
        for avro_block in fastavro.block_reader(avro_file):
            yield avro_block
Ejemplo n.º 9
0
 def load(cls, fname):
     '''
     Restore a search index from the pickle file fname.

     The pickle holds the data directory, the data file name and the two
     lookup dictionaries; the data file itself is reopened and a fresh
     block reader is attached to it. The block cursor starts out unset.
     '''
     index = cls.__new__(cls)
     super(SearchIndex, index).__init__()
     index.logger = logging.getLogger(
         'similar_item_service.search.SearchIndex')

     # Read the whole pickled state first, then spread it over the attributes.
     with open(fname, 'rb') as pickled:
         state = pickle.load(pickled)
     (index.datadir, index.datafile,
      index.itemsByCategory, index.itemPositions) = state

     index.logger.info(
         f"Loading search index for {os.path.join(index.datadir, index.datafile)} from {fname}"
     )

     # The handle is left open on purpose: the reader keeps reading from it.
     index.fo = open(os.path.join(index.datadir, index.datafile), "rb")
     index.reader = block_reader(index.fo)
     index.curBlock = index.curIdx = None
     return index