Example #1
def _load_proteins(subset_genes: Set[GeneLabel], output_csv=False):
    # De-duplicate and sort the genes
    subset_genes = sorted(frozenset(subset_genes))
    if len(subset_genes) > 0:
        batches = grouper(subset_genes)
    else:
        batches = [None]

    first_chunk = True
    for batch in batches:
        try:
            # Special case for 'all'
            if batch is None:
                chunked_data = pd.read_hdf(_PROTEIN_META_HDF, 'protein_meta',
                                           chunksize=HDF5_CHUNKSIZE)
            else:
                batch = frozenset(batch)
                chunked_data = pd.read_hdf(_PROTEIN_META_HDF, 'protein_meta',
                                           local_variables=dict(batch=batch),
                                           where='protein in batch',
                                           chunksize=HDF5_CHUNKSIZE)

            for chunk in chunked_data:
                if not output_csv:
                    for __, row in chunk.iterrows():
                        yield ProteinInfo.from_row(row)
                else:
                    yield chunk.to_csv(index=False, header=first_chunk)
                first_chunk = False
        except NotImplementedError:
            # batch is None when loading everything
            raise Exception(sorted(batch) if batch is not None else 'all')
Example #2
def import_file(database, path, module, location=None, chunk_size=256):
    reader = csv_file(path, fields=module.FIELD_NAMES)

    # Ceiling division so the final, partial chunk is still counted
    n_chunks = -(-line_count(path) // chunk_size)
    for chunk in progressbar.progressbar(grouper(reader, chunk_size),
                                         max_value=n_chunks):
        import_chunk(database, chunk, module, location=location)
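csv_file, line_count, and import_chunk are project-specific helpers that are not shown here. For context, a minimal sketch of what line_count presumably does (the body below is an assumption, not the project's actual code):

def line_count(path):
    # Hypothetical helper: count the lines in the file to size the progress bar
    with open(path) as f:
        return sum(1 for _ in f)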
Example #3
 def test_grouper(self):
     data = [1, 2, 3, 4, 5]
     grouped = grouper(data, 2)
     self.assertEqual([1, 2], list(next(grouped)))
     self.assertEqual([3, 4], list(next(grouped)))
     self.assertEqual([5, None], list(next(grouped)))
     with self.assertRaises(StopIteration):
         next(grouped)
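This test pins down the grouper contract relied on throughout these examples: fixed-size chunks, with the final chunk padded by None. A minimal sketch matching that behavior, along the lines of the classic itertools recipe (some examples pass the group size first or supply extra arguments, so those projects likely define their own variants):

from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    # Collect data into fixed-length chunks, padding the last chunk:
    # grouper([1, 2, 3, 4, 5], 2) --> (1, 2) (3, 4) (5, None)
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)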
Example #4
    def _audio_file_spectrogram(self, _file, nfft, duration, overlap):
        # first get the spec_params and let the client set up the canvas
        fs = _file.sample_rate
        spec_params = get_audio_spectrogram_params(_file, fs, duration, nfft,
                                                   overlap)
        self.send_spectrogram_new(spec_params)

        # now let's compute the spectrogram and send it over
        data = _file[:spec_params.nsamples].sum(axis=1)
        for chunk in grouper(data, spec_params.chunksize, spec_params.nfft):
            chunk = np.array(chunk)
            spec = spectrogram(chunk, spec_params)
            self.send_spectrogram_update(spec)
Example #5
  def _audio_file_spectrogram(self, _file, nfft, duration, overlap):
    # first get the spec_params and let the client set up the canvas
    fs = _file.sample_rate
    spec_params = get_audio_spectrogram_params(
        _file, fs, duration, nfft, overlap)
    self.send_spectrogram_new(spec_params)

    # now let's compute the spectrogram and send it over
    data = _file[:spec_params.nsamples].sum(axis=1)
    for chunk in grouper(data, spec_params.chunksize, spec_params.nfft):
      chunk = np.array(chunk)
      spec = spectrogram(chunk, spec_params)
      self.send_spectrogram_update(spec)
Example #6
 def load(self, iterable: Collection, batch_size=10000) -> None:
     written_rows_count = 0
     influxdb_records = []
     for batch in grouper(iterable, batch_size):
         for record in batch:
             # grouper pads the final batch with None; skip the padding
             if record is not None:
                 influxdb_records.append(record)
         self.influxdb_client.write_points(influxdb_records,
                                           batch_size=1000)
         written_rows_count += len(influxdb_records)
         influxdb_records = []
         logger.info('InfluxDB load: {}/{} records written'.format(
             written_rows_count, len(iterable)))
Example #7
 def get_buttons_routes(self, user_routes):
     # TODO: too many buttons
     routes_list = sorted(list(self.cds.bus_routes.keys()), key=natural_sort_key)
     routes_groups = list(grouper(8, routes_list))
     route_btns = [[InlineKeyboardButton('Hide', callback_data='hide')],
                   [InlineKeyboardButton('All', callback_data='all'),
                    InlineKeyboardButton('None', callback_data='none')]
                   ] + [
                      [InlineKeyboardButton(f"{x}{'+' if x in user_routes else ''}", callback_data=x)
                       for x in group if x]
                      for group in routes_groups]
     keyboard = route_btns
     return keyboard
Example #8
    def insert_hashes(self, record_id, hashes):
        values = []
        for hash, offset in hashes:
            values.append({
                'hash': hash,
                'record_id': record_id,
                'offset': offset
            })
        rows = []
        for split_values in grouper(values, 1000):
            for row in split_values:
                # grouper pads the final group with None; skip the padding
                if row is not None:
                    rows.append(Fingerprints(**row))

        self.session.add_all(rows)
        self.session.commit()
Example #9
    def return_matches(self, hashes):
        mapper = {}
        for hash, offset in hashes:
            mapper[hash.upper()] = offset

        values = mapper.keys()

        for split_values in grouper(values, 1000):
            # drop grouper's None padding before building the IN clause
            split_values = [v for v in split_values if v is not None]
            records = self.session.query(Fingerprints.hash,
                                         Fingerprints.record_id,
                                         Fingerprints.offset).filter(
                                             Fingerprints.hash.in_(
                                                 list(split_values)))

            for row in records.all():
                key = bytes("{0}".format(row[0]), encoding="ascii")
                if key in mapper:
                    yield (row[1], row[2] - mapper[key])
Example #10
def _load_domains(subset_genes: Set[GeneLabel],
                  columns=None):

    if not subset_genes:
        return []

    for batch in grouper(subset_genes):

        try:
            chunked_data = pd.read_hdf(_ANNOTATION_FILE, 'domains',
                                       local_variables=dict(batch=batch),
                                       where='protein in batch',
                                       columns=columns,
                                       chunksize=HDF5_CHUNKSIZE)

            for chunk in chunked_data:
                for __, row in chunk.iterrows():
                    yield row.to_dict()

        except NotImplementedError:
            raise Exception(sorted(batch))
Example #11
def _load_edges(subset_genes: Set[GeneLabel], columns=None):

    if not subset_genes:
        return []

    seen_edges = set()

    # PyTables crashes with large "in" queries,
    # so we fetch the data in batches.

    for batch in grouper(subset_genes, n=BATCH_SIZE_EDGES):

        # Note that to fetch the batched data correctly we need to fetch
        # all interactions for the proteins and then post-filter them
        # to subset

        try:
            chunked_data = pd.read_hdf(
                _NETWORK_FILE,
                'edges',
                local_variables=dict(batch=batch),
                where='protein_a in batch or protein_b in batch',
                columns=columns,
                chunksize=HDF5_CHUNKSIZE)

            for chunk in chunked_data:

                chunk = chunk.query(
                    'protein_a in @subset_genes and protein_b in @subset_genes'
                )

                for __, row in chunk.iterrows():

                    edge = row['protein_a'], row['protein_b']
                    if edge not in seen_edges:
                        yield row.to_dict()
                        seen_edges.add(edge)

        except NotImplementedError:
            raise Exception(sorted(batch))
Example #12
def _load_subset(subset_genes: Set[GeneLabel], subset_keys=None):

    if not subset_genes:
        return []

    for batch in grouper(subset_genes):

        try:
            chunked_data = pd.read_hdf(_PTM_RESPONSE_FILE,
                                       'ptm_matrix',
                                       local_variables=dict(batch=batch),
                                       where='protein in batch',
                                       chunksize=HDF5_CHUNKSIZE)
        except NotImplementedError:
            raise Exception(sorted(batch))

        for chunk in chunked_data:

            for __, row in chunk.iterrows():
                # NaNs will be undefined
                d = {}
                row = row.dropna()

                for col, value in row.items():
                    if col == 'protein':
                        d[col] = value
                        continue

                    ptm, __, key = col.partition('-')

                    if subset_keys is not None and key not in subset_keys:
                        continue

                    d.setdefault(ptm, {})[key] = value

                yield d
Example #13
def _load_nodes(subset_genes: Set[GeneLabel], columns=None):

    if not subset_genes:
        return []

    for batch in grouper(subset_genes):

        try:
            data = pd.read_hdf(_NETWORK_FILE,
                               'nodes',
                               local_variables=dict(batch=batch),
                               where='protein in batch',
                               columns=columns)
        except NotImplementedError:
            raise Exception(sorted(batch))

        for __, row in data.iterrows():
            # NaNs will be undefined
            row = row.dropna()
            yield row.to_dict()
Example #14
def _load_data(subset_genes: Set[GeneLabel],
               columns=None):

    if len(subset_genes) == 0:
        # Return an empty iterable if the subset is specified but empty
        return []

    # PyTables fails when searching for a large subset, so fetch it in batches.
    for batch in grouper(subset_genes):
        batch = frozenset(batch)

        try:
            chunked_data = pd.read_hdf(_MATRIX_FILE, 'enrichment_data_minimal',
                                       local_variables=dict(batch=batch),
                                       where=f'{GENE_LABEL_COLUMN} in batch',
                                       columns=columns, chunksize=HDF5_CHUNKSIZE)

            for chunk in chunked_data:
                yield from chunk.to_dict(orient='records')

        except NotImplementedError:
            raise Exception(sorted(batch))
Example #15
from itertools import chain

def altsplit(triangles):
    # Transpose each group of three triangles, interleaving their vertices
    return chain.from_iterable(zip(*group) for group in grouper(triangles, 3))
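For intuition, a small usage sketch with made-up data (this assumes the number of triangles is a multiple of three; otherwise grouper's None padding would leak into the zip):

triangles = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
list(altsplit(triangles))
# -> [(0, 3, 6), (1, 4, 7), (2, 5, 8)]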