Example #1
    def build_vocabularies(self, rows: RDD):
        """
        Process rows to gather values and paths with their frequencies.
        :param rows: row structure is ((key, doc), val) where:
            * key: str with the path context
            * doc: file name
            * val: number of occurrences of key in doc
        """

        def _flatten_row(row: Row):
            # strip the 'v.' namespace prefix from the string so it can be parsed as a tuple
            k = Vocabulary2Id._unstringify_path_context(row)
            return [(k[0], 1), (k[1], 1), (k[2], 1)]

        rows = rows \
            .flatMap(_flatten_row) \
            .reduceByKey(operator.add) \
            .persist()

        values = rows.filter(lambda x: isinstance(x[0], str)).collect()
        paths = rows.filter(lambda x: isinstance(x[0], tuple)).collect()

        value2index = {w: i for i, (w, _) in enumerate(values)}
        path2index = {w: i for i, (w, _) in enumerate(paths)}
        value2freq = dict(values)
        path2freq = dict(paths)

        rows.unpersist()

        return value2index, path2index, value2freq, path2freq
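
A minimal, self-contained sketch of the same counting pattern, assuming each row already carries its (value, path, value) triple so the Vocabulary2Id._unstringify_path_context step can be skipped; the sample rows are made up.

import operator
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
rows = sc.parallelize([('x', ('a', 'b'), 'y'), ('x', ('a', 'c'), 'y')])

# count occurrences of every value and every path across all rows
counts = (rows
          .flatMap(lambda k: [(k[0], 1), (k[1], 1), (k[2], 1)])
          .reduceByKey(operator.add))

values = counts.filter(lambda kv: isinstance(kv[0], str)).collect()
paths = counts.filter(lambda kv: isinstance(kv[0], tuple)).collect()

value2index = {w: i for i, (w, _) in enumerate(values)}
value2freq = dict(values)
print(value2index, value2freq)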
Example #2
def clean_claims(claims: RDD, b_item_map: Broadcast):
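    """Resolve claim objects (and quantity units) through the broadcast item
    map; claims whose datatype is outside dt_filter or whose target is missing
    from the map are dropped."""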
    def clean(claim):
        item_map = b_item_map.value
        if claim.datatype == 'wikibase-item':
            if claim.object in item_map:
                claim = claim._replace(object=item_map[claim.object])
                return claim
            else:
                return None
        elif claim.datatype == 'quantity':
            unit = claim.object.unit
            unit = unit.split('/')[-1]
            if unit in item_map:
                claim = claim._replace(object=item_map[unit])
                return claim
            else:
                return None
        return claim

    dt_filter = {
        'wikibase-item', 'string', 'monolingualtext', 'quantity', 'time'
    }

    return claims.filter(lambda c: c.datatype in dt_filter).map(clean).filter(
        lambda c: c is not None)
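
A hedged driver sketch for clean_claims; Claim and Quantity are illustrative stand-ins for the project's claim records, and the ids and labels in the item map are made up.

from collections import namedtuple
from pyspark import SparkContext

Claim = namedtuple('Claim', ['datatype', 'object'])
Quantity = namedtuple('Quantity', ['amount', 'unit'])

sc = SparkContext.getOrCreate()
b_item_map = sc.broadcast({'Q42': 'Douglas Adams', 'Q11573': 'metre'})

claims = sc.parallelize([
    Claim('wikibase-item', 'Q42'),                                   # object replaced by its mapped value
    Claim('quantity', Quantity(3.0, 'http://www.wikidata.org/entity/Q11573')),
    Claim('wikibase-item', 'Q999'),                                  # not in the map, dropped
    Claim('globe-coordinate', None),                                 # datatype filtered out
])
print(clean_claims(claims, b_item_map).collect())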
Example #3
def extract_property_map(parsed_wikidata: RDD):
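    """Build a mapping from property id to its English label."""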
    def parse_property(prop):
        label = prop["labels"]["en"]["value"]
        return prop["id"], label

    return (parsed_wikidata
            .filter(lambda d: d["type"] == "property")
            .map(parse_property)
            .collectAsMap())
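
A hypothetical driver for extract_property_map; the dictionaries carry only the fields the function actually reads, not the full Wikidata dump schema.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
parsed_wikidata = sc.parallelize([
    {'type': 'property', 'id': 'P31', 'labels': {'en': {'value': 'instance of'}}},
    {'type': 'item', 'id': 'Q42', 'labels': {'en': {'value': 'Douglas Adams'}}},
])
print(extract_property_map(parsed_wikidata))  # {'P31': 'instance of'}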
Example #4
def analyze(rddDns: RDD) -> Dict[str, Result]:
  # filter out trustedDNS
  log = getLogger()
  premiseCheck_ = functools.partial(premiseCheck, 
                                    Global.ALLOWED_NAME_LEN, 
                                    Global.RESTRICTED_SYMS, 
                                    Global.MAX_BODY_SIZE,
                                    Global.MIN_TTL)
  
  timer = Timer()
  # cache because only this RDD is reused throughout the application
  ipPartGen = (rddDns
               .filter(compose(operator.not_, premiseCheck_))
               .map(lambda dns: str(dns.sip))
               .distinct()
               .glom()
               .toLocalIterator())
  log.info(f'Time spent on premise analysis = {timer.elapsed()}')
  # log.debug(ips)

  timer = Timer()
  ipdoms = {}
  # TODO: refactor this per-IP loop; it currently launches one Spark job per IP

  for ipPart in ipPartGen:
    for ip in set(ipPart):
      if ip not in ipdoms:
        log.debug(ip)
        ipdoms[ip] = np.array(
          rddDns.filter(
            lambda dns: ip in [dns.dip, dns.sip]).map(
              lambda dns: parseDomain(str(dns.getName()))).distinct().collect())
        log.debug(ipdoms.get(ip))
      
  log.info(f'Time spent on searching packets for chosen IPs = {timer.elapsed()}')

  timer = Timer()
  result = []
  for ip, doms in ipdoms.items():
    result.append((str(ip), repr(unigramAnalysis(doms))))
  log.info(f'Time spent on unigram distribution analysis = {timer.elapsed()}')

  rddDns.unpersist()
  
  return dict(result)
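
A hedged sketch (not part of the original) of a single-pass alternative to the per-IP loop above: broadcast the suspicious source IPs, emit (ip, domain) pairs in one pass over rddDns, and group them, so the RDD is scanned once instead of once per IP. It reuses compose, premiseCheck_, parseDomain, np and the dns record fields assumed by analyze.

suspiciousIps = set(rddDns.filter(compose(operator.not_, premiseCheck_))
                          .map(lambda dns: str(dns.sip))
                          .distinct()
                          .collect())
bSuspicious = rddDns.context.broadcast(suspiciousIps)

def emitPairs(dns):
  # emit (ip, domain) only for packets touching a suspicious IP
  dom = parseDomain(str(dns.getName()))
  return [(ip, dom) for ip in (str(dns.sip), str(dns.dip)) if ip in bSuspicious.value]

ipdoms = (rddDns.flatMap(emitPairs)
                .distinct()
                .groupByKey()
                .mapValues(lambda doms: np.array(list(doms)))
                .collectAsMap())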
Example #5
def compute_fdr_and_filter_results(
    moldb: MolecularDB,
    fdr: FDR,
    ion_formula_map_df: pd.DataFrame,
    formula_metrics_df: pd.DataFrame,
    formula_images_rdd: pyspark.RDD,
    scoring_model: Optional[ScoringModel],
) -> Tuple[pd.DataFrame, pyspark.RDD, FdrDiagnosticBundle]:
    """Compute FDR for database annotations and filter them."""

    moldb_formula_map_df = ion_formula_map_df[
        ion_formula_map_df.moldb_id == moldb.id
    ].drop('moldb_id', axis=1)
    moldb_metrics_fdr_df = compute_fdr(
        fdr, formula_metrics_df, moldb_formula_map_df, scoring_model
    )

    if not moldb.targeted:
        max_fdr = 0.5
        moldb_metrics_fdr_df = moldb_metrics_fdr_df[
            moldb_metrics_fdr_df.fdr <= max_fdr]
    else:
        # fdr is not null for target ion formulas
        moldb_metrics_fdr_df = moldb_metrics_fdr_df[
            ~moldb_metrics_fdr_df.fdr.isnull()]

    moldb_ion_images_rdd = formula_images_rdd.filter(
        lambda kv: kv[0] in moldb_metrics_fdr_df.index  # pylint: disable=cell-var-from-loop
    )
    moldb_ion_metrics_df = moldb_metrics_fdr_df.merge(fdr.target_modifiers_df,
                                                      left_on='modifier',
                                                      right_index=True)

    # Extract the metrics for just this database, avoiding duplicates and handling missing rows
    all_metrics_df = formula_metrics_df.merge(
        moldb_formula_map_df.index.rename(
            'formula_i').drop_duplicates().to_frame(index=True)[[]],
        left_index=True,
        right_index=True,
        how='inner',
    )

    formula_map_df = (
        moldb_formula_map_df.drop(columns=['ion_formula'])
        .rename_axis(index='formula_i')
        .reset_index()
    )
    fdr_bundle = FdrDiagnosticBundle(
        decoy_sample_size=fdr.decoy_sample_size,
        decoy_map_df=fdr.td_df,
        formula_map_df=formula_map_df,
        metrics_df=all_metrics_df,
    )

    return moldb_ion_metrics_df, moldb_ion_images_rdd, fdr_bundle
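
A hedged, self-contained sketch (illustrative names, not the project's code) of a broadcast-set variant of the image filter above: shipping only the surviving formula ids avoids pickling the whole metrics DataFrame into the filter closure.

import pandas as pd
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
metrics_df = pd.DataFrame({'msm': [0.9, 0.4]}, index=[10, 11])       # stand-in for moldb_metrics_fdr_df
images_rdd = sc.parallelize([(10, 'img-a'), (11, 'img-b'), (12, 'img-c')])

kept_ids = sc.broadcast(set(metrics_df.index))
kept_images = images_rdd.filter(lambda kv: kv[0] in kept_ids.value)
print(kept_images.collect())  # [(10, 'img-a'), (11, 'img-b')]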
Example #6
def filter_big_time_step(rdd: RDD, min_step: float, max_step: float) -> RDD:
    """ Some plane sometimes pass out for a long time (1k sec and more) without moving.
    We remove those planes from the records (SHALL WE ???)
    """
    def map_get_max_time_step(record):
        time = record.Time
        record_max_step = 0
        record_min_step = 0
        if len(time) > 1:
            steps = [t2 - t1 for t1, t2 in zip(time[:-1], time[1:])]
            record_max_step = np.max(steps)
            record_min_step = np.min(steps)
        return record, record_min_step, record_max_step

    rdd = rdd.map(map_get_max_time_step)

    rdd = rdd.filter(lambda r: r[1] >= min_step * 1000 and r[2] <= max_step * 1000) \
             .map(lambda r: r[0])  # drop the per-record min/max steps, keep only the record

    # print(f"Applied filter on time steps of size {(min_step, max_step)}, remains {rdd.count()} records")
    return rdd
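
A hypothetical driver for filter_big_time_step; Record is an illustrative stand-in whose Time field is in milliseconds, matching the * 1000 scaling in the filter.

from collections import namedtuple
import numpy as np
from pyspark import SparkContext

Record = namedtuple('Record', ['Time'])
sc = SparkContext.getOrCreate()
records = sc.parallelize([
    Record(Time=[0, 1_000, 2_000]),       # 1 s steps, kept
    Record(Time=[0, 1_000, 2_000_000]),   # ~2000 s gap, dropped
])
kept = filter_big_time_step(records, min_step=0.5, max_step=60.0)
print(kept.count())  # 1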
Example #7
    def normal_order(self, terms: RDD, **kwargs):
        """Normal order the terms according to generalized Wick theorem.

        The actual expansion is based on the information given in the subclasses
        by the abstract properties.

        """
        comparator = kwargs.pop('comparator', self.comparator)
        contractor = kwargs.pop('contractor', self.contractor)
        if len(kwargs) != 0:
            raise ValueError('Invalid arguments to Wick normal order', kwargs)

        phase = self.phase
        symms = self.symms
        resolvers = self.resolvers

        terms.cache()
        terms_to_proc = terms.filter(lambda x: len(x.vecs) > 1)
        keep_top = 0 if comparator is None else 1
        terms_to_keep = terms.filter(lambda x: len(x.vecs) <= keep_top)
        terms_to_proc.cache()
        if terms_to_proc.count() == 0:
            return terms_to_keep

        # Triples: term, contractions, schemes.
        wick_terms = terms_to_proc.map(lambda x: _prepare_wick(
            x, comparator, contractor, symms.value, resolvers.value))

        if self._wick_parallel == 0:

            normal_ordered = wick_terms.flatMap(lambda x: [
                _form_term_from_wick(x[0], x[1], phase, resolvers.value, i)
                for i in x[2]
            ])

        elif self._wick_parallel == 1:

            flattened = wick_terms.flatMap(
                lambda x: [(x[0], x[1], i) for i in x[2]])
            if self._num_partitions is not None:
                flattened = flattened.repartition(self._num_partitions)

            normal_ordered = flattened.map(lambda x: _form_term_from_wick(
                x[0], x[1], phase, resolvers.value, x[2]))

        elif self._wick_parallel == 2:

            # This level of parallelism is reserved for really hard problems.
            expanded = []
            for term, contrs, schemes in wick_terms.collect():
                # Work around a probable Spark bug: problems occur when
                # closures created inside a loop are distributed out.
                form_term = functools.partial(_form_term_from_wick_bcast, term,
                                              contrs, phase, resolvers)

                curr = self._ctx.parallelize(schemes).map(form_term)
                expanded.append(curr)
                continue

            normal_ordered = self._ctx.union(expanded)

        else:
            raise ValueError('Invalid Wick expansion parallel level',
                             self._wick_parallel)

        return terms_to_keep.union(normal_ordered)
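
The cache/split/short-circuit/union structure above is a general Spark pattern; a toy sketch of it on plain integers (it does not reproduce the Wick expansion itself):

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
terms = sc.parallelize(range(10)).cache()

terms_to_keep = terms.filter(lambda x: x <= 1)        # analogue of the trivially kept terms
terms_to_proc = terms.filter(lambda x: x > 1).cache()

if terms_to_proc.count() == 0:
    result = terms_to_keep
else:
    result = terms_to_keep.union(terms_to_proc.map(lambda x: x * x))
print(sorted(result.collect()))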
Example #8
def remove_header(rdd: RDD) -> RDD:
    header = rdd.first()
    return rdd.filter(lambda row: row != header)
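
A minimal usage sketch for remove_header; note that any later row identical to the header line would also be dropped.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
lines = sc.parallelize(['name,age', 'ana,31', 'bob,45'])
print(remove_header(lines).collect())  # ['ana,31', 'bob,45']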