Example #1
    def build_vocabularies(self, rows: RDD):
        """
        Process rows to gather values and paths with their frequencies.
        :param rows: row structure is ((key, doc), val) where:
            * key: str with the path context
            * doc: file name
            * val: number of occurrences of key in doc
        """

        def _flatten_row(row: Row):
            # strip the 'v.' namespace prefix from the string so it can be parsed as a tuple
            k = Vocabulary2Id._unstringify_path_context(row)
            return [(k[0], 1), (k[1], 1), (k[2], 1)]

        rows = rows \
            .flatMap(_flatten_row) \
            .reduceByKey(operator.add) \
            .persist()

        values = rows.filter(lambda x: isinstance(x[0], str)).collect()
        paths = rows.filter(lambda x: isinstance(x[0], tuple)).collect()

        value2index = {w: i for i, (w, _) in enumerate(values)}
        path2index = {w: i for i, (w, _) in enumerate(paths)}
        value2freq = dict(values)
        path2freq = dict(paths)

        rows.unpersist()

        return value2index, path2index, value2freq, path2freq
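
A minimal, self-contained sketch of the same counting pattern, assuming each row already carries its (value, path, value) triple so the Vocabulary2Id._unstringify_path_context step can be skipped; the sample rows are made up.

import operator
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
rows = sc.parallelize([('x', ('a', 'b'), 'y'), ('x', ('a', 'c'), 'y')])

# count occurrences of every value and every path across all rows
counts = (rows
          .flatMap(lambda k: [(k[0], 1), (k[1], 1), (k[2], 1)])
          .reduceByKey(operator.add))

values = counts.filter(lambda kv: isinstance(kv[0], str)).collect()
paths = counts.filter(lambda kv: isinstance(kv[0], tuple)).collect()

value2index = {w: i for i, (w, _) in enumerate(values)}
value2freq = dict(values)
print(value2index, value2freq)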
Example #2
def clean_claims(claims: RDD, b_item_map: Broadcast):
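    """Resolve claim objects (and quantity units) through the broadcast item
    map; claims whose datatype is outside dt_filter or whose target is missing
    from the map are dropped."""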
    def clean(claim):
        item_map = b_item_map.value
        if claim.datatype == 'wikibase-item':
            if claim.object in item_map:
                claim = claim._replace(object=item_map[claim.object])
                return claim
            else:
                return None
        elif claim.datatype == 'quantity':
            unit = claim.object.unit
            unit = unit.split('/')[-1]
            if unit in item_map:
                claim = claim._replace(object=item_map[unit])
                return claim
            else:
                return None
        return claim

    dt_filter = {
        'wikibase-item', 'string', 'monolingualtext', 'quantity', 'time'
    }

    return claims.filter(lambda c: c.datatype in dt_filter).map(clean).filter(
        lambda c: c is not None)
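
A hedged driver sketch for clean_claims; Claim and Quantity are illustrative stand-ins for the project's claim records, and the ids and labels in the item map are made up.

from collections import namedtuple
from pyspark import SparkContext

Claim = namedtuple('Claim', ['datatype', 'object'])
Quantity = namedtuple('Quantity', ['amount', 'unit'])

sc = SparkContext.getOrCreate()
b_item_map = sc.broadcast({'Q42': 'Douglas Adams', 'Q11573': 'metre'})

claims = sc.parallelize([
    Claim('wikibase-item', 'Q42'),                                   # object replaced by its mapped value
    Claim('quantity', Quantity(3.0, 'http://www.wikidata.org/entity/Q11573')),
    Claim('wikibase-item', 'Q999'),                                  # not in the map, dropped
    Claim('globe-coordinate', None),                                 # datatype filtered out
])
print(clean_claims(claims, b_item_map).collect())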
Example #3
def extract_property_map(parsed_wikidata: RDD):
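    """Build a mapping from property id to its English label."""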
    def parse_property(prop):
        label = prop["labels"]["en"]["value"]
        return prop["id"], label

    return (parsed_wikidata
            .filter(lambda d: d["type"] == "property")
            .map(parse_property)
            .collectAsMap())
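
A hypothetical driver for extract_property_map; the dictionaries carry only the fields the function actually reads, not the full Wikidata dump schema.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
parsed_wikidata = sc.parallelize([
    {'type': 'property', 'id': 'P31', 'labels': {'en': {'value': 'instance of'}}},
    {'type': 'item', 'id': 'Q42', 'labels': {'en': {'value': 'Douglas Adams'}}},
])
print(extract_property_map(parsed_wikidata))  # {'P31': 'instance of'}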
Example #4
def analyze(rddDns: RDD) -> Dict[str, Result]:
  # filter out trustedDNS
  log = getLogger()
  premiseCheck_ = functools.partial(premiseCheck, 
                                    Global.ALLOWED_NAME_LEN, 
                                    Global.RESTRICTED_SYMS, 
                                    Global.MAX_BODY_SIZE,
                                    Global.MIN_TTL)
  
  timer = Timer()
  # cache because only this RDD is reused throughout the application
  ipPartGen = (rddDns
               .filter(compose(operator.not_, premiseCheck_))
               .map(lambda dns: str(dns.sip))
               .distinct()
               .glom()
               .toLocalIterator())
  log.info(f'Time spent on premise analysis = {timer.elapsed()}')
  # log.debug(ips)

  timer = Timer()
  ipdoms = {}
  # TODO: refactor this per-IP loop; it currently launches one Spark job per IP

  for ipPart in ipPartGen:
    for ip in set(ipPart):
      if ip not in ipdoms:
        log.debug(ip)
        ipdoms[ip] = np.array(
          rddDns.filter(
            lambda dns: ip in [dns.dip, dns.sip]).map(
              lambda dns: parseDomain(str(dns.getName()))).distinct().collect())
        log.debug(ipdoms.get(ip))
      
  log.info(f'Time spent on searching packets for chosen IPs = {timer.elapsed()}')

  timer = Timer()
  result = []
  for ip, doms in ipdoms.items():
    result.append((str(ip), repr(unigramAnalysis(doms))))
  log.info(f'Time spent on unigram distribution analysis = {timer.elapsed()}')

  rddDns.unpersist()
  
  return dict(result)
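
A hedged sketch (not part of the original) of a single-pass alternative to the per-IP loop above: broadcast the suspicious source IPs, emit (ip, domain) pairs in one pass over rddDns, and group them, so the RDD is scanned once instead of once per IP. It reuses compose, premiseCheck_, parseDomain, np and the dns record fields assumed by analyze.

suspiciousIps = set(rddDns.filter(compose(operator.not_, premiseCheck_))
                          .map(lambda dns: str(dns.sip))
                          .distinct()
                          .collect())
bSuspicious = rddDns.context.broadcast(suspiciousIps)

def emitPairs(dns):
  # emit (ip, domain) only for packets touching a suspicious IP
  dom = parseDomain(str(dns.getName()))
  return [(ip, dom) for ip in (str(dns.sip), str(dns.dip)) if ip in bSuspicious.value]

ipdoms = (rddDns.flatMap(emitPairs)
                .distinct()
                .groupByKey()
                .mapValues(lambda doms: np.array(list(doms)))
                .collectAsMap())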
Example #5
def compute_fdr_and_filter_results(
    moldb: MolecularDB,
    fdr: FDR,
    ion_formula_map_df: pd.DataFrame,
    formula_metrics_df: pd.DataFrame,
    formula_images_rdd: pyspark.RDD,
    scoring_model: Optional[ScoringModel],
) -> Tuple[pd.DataFrame, pyspark.RDD, FdrDiagnosticBundle]:
    """Compute FDR for database annotations and filter them."""

    moldb_formula_map_df = ion_formula_map_df[
        ion_formula_map_df.moldb_id == moldb.id
    ].drop('moldb_id', axis=1)
    moldb_metrics_fdr_df = compute_fdr(
        fdr, formula_metrics_df, moldb_formula_map_df, scoring_model
    )

    if not moldb.targeted:
        max_fdr = 0.5
        moldb_metrics_fdr_df = moldb_metrics_fdr_df[
            moldb_metrics_fdr_df.fdr <= max_fdr]
    else:
        # fdr is not null for target ion formulas
        moldb_metrics_fdr_df = moldb_metrics_fdr_df[
            ~moldb_metrics_fdr_df.fdr.isnull()]

    moldb_ion_images_rdd = formula_images_rdd.filter(
        lambda kv: kv[0] in moldb_metrics_fdr_df.index  # pylint: disable=cell-var-from-loop
    )
    moldb_ion_metrics_df = moldb_metrics_fdr_df.merge(fdr.target_modifiers_df,
                                                      left_on='modifier',
                                                      right_index=True)

    # Extract the metrics for just this database, avoiding duplicates and handling missing rows
    all_metrics_df = formula_metrics_df.merge(
        moldb_formula_map_df.index.rename(
            'formula_i').drop_duplicates().to_frame(index=True)[[]],
        left_index=True,
        right_index=True,
        how='inner',
    )

    formula_map_df = (
        moldb_formula_map_df.drop(columns=['ion_formula'])
        .rename_axis(index='formula_i')
        .reset_index()
    )
    fdr_bundle = FdrDiagnosticBundle(
        decoy_sample_size=fdr.decoy_sample_size,
        decoy_map_df=fdr.td_df,
        formula_map_df=formula_map_df,
        metrics_df=all_metrics_df,
    )

    return moldb_ion_metrics_df, moldb_ion_images_rdd, fdr_bundle
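
A hedged, self-contained sketch (illustrative names, not the project's code) of a broadcast-set variant of the image filter above: shipping only the surviving formula ids avoids pickling the whole metrics DataFrame into the filter closure.

import pandas as pd
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
metrics_df = pd.DataFrame({'msm': [0.9, 0.4]}, index=[10, 11])       # stand-in for moldb_metrics_fdr_df
images_rdd = sc.parallelize([(10, 'img-a'), (11, 'img-b'), (12, 'img-c')])

kept_ids = sc.broadcast(set(metrics_df.index))
kept_images = images_rdd.filter(lambda kv: kv[0] in kept_ids.value)
print(kept_images.collect())  # [(10, 'img-a'), (11, 'img-b')]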
Example #6
def filter_big_time_step(rdd: RDD, min_step: float, max_step: float) -> RDD:
    """ Some plane sometimes pass out for a long time (1k sec and more) without moving.
    We remove those planes from the records (SHALL WE ???)
    """
    def map_get_max_time_step(record):
        time = record.Time
        record_max_step = 0
        record_min_step = 0
        if len(time) > 1:
            steps = [t2 - t1 for t1, t2 in zip(time[:-1], time[1:])]
            record_max_step = np.max(steps)
            record_min_step = np.min(steps)
        return record, record_min_step, record_max_step

    rdd = rdd.map(map_get_max_time_step)

    rdd = rdd.filter(lambda r: r[1] >= min_step * 1000 and r[2] <= max_step * 1000) \
             .map(lambda r: r[0])  # drop the per-record min/max steps, keep only the record

    # print(f"Applied filter on time steps of size {(min_step, max_step)}, remains {rdd.count()} records")
    return rdd
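
A hypothetical driver for filter_big_time_step; Record is an illustrative stand-in whose Time field is in milliseconds, matching the * 1000 scaling in the filter.

from collections import namedtuple
import numpy as np
from pyspark import SparkContext

Record = namedtuple('Record', ['Time'])
sc = SparkContext.getOrCreate()
records = sc.parallelize([
    Record(Time=[0, 1_000, 2_000]),       # 1 s steps, kept
    Record(Time=[0, 1_000, 2_000_000]),   # ~2000 s gap, dropped
])
kept = filter_big_time_step(records, min_step=0.5, max_step=60.0)
print(kept.count())  # 1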
Example #7
    def normal_order(self, terms: RDD, **kwargs):
        """Normal order the terms according to generalized Wick theorem.

        The actual expansion is based on the information given in the subclasses
        by the abstract properties.

        """
        comparator = kwargs.pop('comparator', self.comparator)
        contractor = kwargs.pop('contractor', self.contractor)
        if len(kwargs) != 0:
            raise ValueError('Invalid arguments to Wick normal order', kwargs)

        phase = self.phase
        symms = self.symms
        resolvers = self.resolvers

        terms.cache()
        terms_to_proc = terms.filter(lambda x: len(x.vecs) > 1)
        keep_top = 0 if comparator is None else 1
        terms_to_keep = terms.filter(lambda x: len(x.vecs) <= keep_top)
        terms_to_proc.cache()
        if terms_to_proc.count() == 0:
            return terms_to_keep

        # Triples: term, contractions, schemes.
        wick_terms = terms_to_proc.map(lambda x: _prepare_wick(
            x, comparator, contractor, symms.value, resolvers.value))

        if self._wick_parallel == 0:

            normal_ordered = wick_terms.flatMap(lambda x: [
                _form_term_from_wick(x[0], x[1], phase, resolvers.value, i)
                for i in x[2]
            ])

        elif self._wick_parallel == 1:

            flattened = wick_terms.flatMap(
                lambda x: [(x[0], x[1], i) for i in x[2]])
            if self._num_partitions is not None:
                flattened = flattened.repartition(self._num_partitions)

            normal_ordered = flattened.map(lambda x: _form_term_from_wick(
                x[0], x[1], phase, resolvers.value, x[2]))

        elif self._wick_parallel == 2:

            # This level of parallelism is reserved for really hard problems.
            expanded = []
            for term, contrs, schemes in wick_terms.collect():
                # Work around a probable Spark bug: problems occur when
                # closures created inside a loop are distributed out.
                form_term = functools.partial(_form_term_from_wick_bcast, term,
                                              contrs, phase, resolvers)

                curr = self._ctx.parallelize(schemes).map(form_term)
                expanded.append(curr)
                continue

            normal_ordered = self._ctx.union(expanded)

        else:
            raise ValueError('Invalid Wick expansion parallel level',
                             self._wick_parallel)

        return terms_to_keep.union(normal_ordered)
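
The cache/split/short-circuit/union structure above is a general Spark pattern; a toy sketch of it on plain integers (it does not reproduce the Wick expansion itself):

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
terms = sc.parallelize(range(10)).cache()

terms_to_keep = terms.filter(lambda x: x <= 1)        # analogue of the trivially kept terms
terms_to_proc = terms.filter(lambda x: x > 1).cache()

if terms_to_proc.count() == 0:
    result = terms_to_keep
else:
    result = terms_to_keep.union(terms_to_proc.map(lambda x: x * x))
print(sorted(result.collect()))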
Example #8
def remove_header(rdd: RDD) -> RDD:
    header = rdd.first()
    return rdd.filter(lambda row: row != header)
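
A minimal usage sketch for remove_header; note that any later row identical to the header line would also be dropped.

from pyspark import SparkContext

sc = SparkContext.getOrCreate()
lines = sc.parallelize(['name,age', 'ana,31', 'bob,45'])
print(remove_header(lines).collect())  # ['ana,31', 'bob,45']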