def guess_input(raw):
    """Print summary information about the input.

    The report covers the line count, the range of non-empty line lengths,
    the total character count, the number of blank-line separators, whether
    tabs or other whitespace occur, the distinct non-whitespace characters,
    the integers found by ``extract_ints`` and the non-numeric words ordered
    by descending multiplicity.

    Args:
        raw: The complete input text as a single string.
    """
    lines = raw.splitlines()
    print(f"# lines: {len(lines)}")

    # Empty lines are ignored so that blank separators do not force the
    # minimum to 0.
    line_lengths = {len(line) for line in lines if line}
    if line_lengths:
        print(f"Line length range: {min(line_lengths)} to {max(line_lengths)}")
    else:
        # min()/max() raise ValueError on an empty collection, which used to
        # crash the report for empty or all-blank input.
        print("Line length range: n/a")

    print(f"# chars: {len(raw)}")
    # Same value as len(raw.split("\n\n")) - 1: non-overlapping occurrences.
    double_newline = raw.count("\n\n")
    print(f"# double newlines: {double_newline}")

    whitespace, tabs = False, False
    seen = set()  # a set keeps the per-character membership test O(1)
    for ch in raw:
        if ch == '\n':
            continue
        if ch == '\t':
            tabs = True
        elif ch.isspace():
            whitespace = True
        else:
            seen.add(ch)
    print(f"Contains tabs: {tabs}")
    print(f"Contains whitespace: {whitespace}")
    print(f"Chars: {''.join(sorted(seen))}")

    ints = list(extract_ints(raw, negative=True))
    print(f"# Ints: {len(ints)}")
    if ints:
        print(f"Int range: {min(ints)} to {max(ints)}")

    ms = Multiset()
    for word in raw.split():
        if word.isnumeric():
            continue
        ms.add(word)
    common = sorted(ms.items(), key=by_index(1), reverse=True)
    print(f"Most common words: {common}")
    print()
def commutative_sequence_variable_partition_iter(values: Multiset, variables: List[VariableWithCount]
                                                 ) -> Iterator[Dict[str, Multiset]]:
    """Yield every way of distributing ``values`` among ``variables``.

    .. note::

        The results are not yielded in any particular order because the
        algorithm works on dictionaries.

    Example:

        For a subject like ``fc(a, a, a, b, b, c)`` and a pattern like
        ``f(x__, y___, y___)`` the input parameters for the partitioning can
        be defined like this:

        >>> x = VariableWithCount(name='x', count=1, minimum=1, default=None)
        >>> y = VariableWithCount(name='y', count=2, minimum=0, default=None)
        >>> values = Multiset('aaabbc')

        The solutions (sorted to get a unique output) are then:

        >>> substitutions = commutative_sequence_variable_partition_iter(values, [x, y])
        >>> as_strings = list(str(Substitution(substitution)) for substitution in substitutions)
        >>> for substitution in sorted(as_strings):
        ...     print(substitution)
        {x ↦ {a, a, a, b, b, c}, y ↦ {}}
        {x ↦ {a, a, a, c}, y ↦ {b}}
        {x ↦ {a, b, b, c}, y ↦ {a}}
        {x ↦ {a, c}, y ↦ {a, b}}

    Args:
        values:
            The multiset of values to partition and distribute among the
            variables.
        variables:
            The variables to distribute the values among. Each one carries a
            name, the number of times it occurs, the minimum number of values
            it needs and an optional default.

    Yields:
        Each substitution (variable name to multiset) that is a valid
        partitioning of the values among the variables.
    """
    # A single variable is delegated to the dedicated helper.
    if len(variables) == 1:
        yield from _commutative_single_variable_partiton_iter(values, variables[0])
        return

    # One generator factory per distinct value; chaining them distributes
    # each value's copies among the variables.
    generators = [
        _make_variable_generator_factory(value, count, variables)
        for value, count in values.items()
    ]
    initial = {var.name: Multiset() for var in variables}  # type: Dict[str, 'Multiset[T]']

    for subst in generator_chain(initial, *generators):
        for var in variables:
            assigned = subst[var.name]
            if var.default is not None and len(assigned) == 0:
                # Nothing was assigned, so the variable takes its default.
                subst[var.name] = var.default
            elif len(assigned) < var.minimum:
                # Too few values for this variable: discard the candidate.
                break
        else:
            # The entry for the unnamed variable (key None) is not reported.
            if None in subst:
                del subst[None]
            yield subst
def count():
    """Print each front-face artist with its number of distinct printings.

    The cube is loaded through ``Loader`` and ``CubeLoader``; its printings
    are de-duplicated before counting, and the artists are printed in
    ascending order of multiplicity.
    """
    db = Loader.load()
    cube = CubeLoader(db).load()

    unique_printings = set(cube.all_printings)
    artists = Multiset(printing.front_face.artist for printing in unique_printings)

    ranked = sorted(artists.items(), key=lambda pair: pair[1])
    for artist, multiplicity in ranked:
        print(artist, multiplicity)
def _match_commutative_operation(subject_operands, pattern, substitution, constraints, matcher):
    """Yield the matches of a commutative operation's operands against a pattern.

    Args:
        subject_operands: Iterable of the subject's operands; it is collected
            into a ``Multiset``.
        pattern: The pattern's operands sorted into categories. The
            attributes read here (``constant``, ``rest``, ``syntactic``,
            ``fixed_variables``, ``sequence_variables``, ``wildcard_fixed``,
            ...) are those provided by ``CommutativePatternsParts``.
        substitution: The variable bindings established so far. It must
            support ``in``, indexing and ``union()``.
        constraints: Passed through to the helper factories and to
            ``_check_constraints``.
        matcher: Passed through to ``_fixed_expr_factory``.

    Yields:
        Every item produced by ``_check_constraints`` for each combined
        substitution that could be built.
    """
    subjects = Multiset(subject_operands)  # type: Multiset

    # All constant pattern operands must occur among the subjects.
    if not pattern.constant <= subjects:
        return
    subjects -= pattern.constant

    rest_expr = pattern.rest + pattern.syntactic
    needed_length = (pattern.sequence_variable_min_length + pattern.fixed_variable_length +
                     len(rest_expr) + pattern.wildcard_min_length)
    if len(subjects) < needed_length:
        return

    # Fixed variables that are already bound consume their replacement from
    # the subjects up front; only the unbound ones remain in fixed_vars.
    fixed_vars = Multiset(pattern.fixed_variables)  # type: Multiset[str]
    for name, count in pattern.fixed_variables.items():
        if name in substitution:
            replacement = substitution[name]
            if issubclass(pattern.operation, AssociativeOperation) and isinstance(
                    replacement, pattern.operation):
                needed_count = Multiset(substitution[name])  # type: Multiset
            else:
                if not isinstance(replacement, Expression):
                    return
                needed_count = Multiset({replacement: 1})
            if count > 1:
                needed_count *= count
            if not needed_count <= subjects:
                return
            subjects -= needed_count
            del fixed_vars[name]

    factories = [_fixed_expr_factory(e, constraints, matcher) for e in rest_expr]

    if not issubclass(pattern.operation, AssociativeOperation):
        for name, count in fixed_vars.items():
            min_count, symbol_type = pattern.fixed_variable_infos[name]
            factory = _fixed_var_iter_factory(name, count, min_count, symbol_type, constraints)
            factories.append(factory)

        if pattern.wildcard_fixed is True:
            factory = _fixed_var_iter_factory(None, 1, pattern.wildcard_min_length, None,
                                              constraints)
            factories.append(factory)
    else:
        # For associative operations only the fixed variables with a symbol
        # type get a factory here; the others are added to the sequence
        # variables below.
        for name, count in fixed_vars.items():
            min_count, symbol_type = pattern.fixed_variable_infos[name]
            if symbol_type is not None:
                factory = _fixed_var_iter_factory(name, count, min_count, symbol_type,
                                                  constraints)
                factories.append(factory)

    expr_counter = Multiset(subjects)  # type: Multiset

    for rem_expr, substitution in generator_chain((expr_counter, substitution), *factories):
        sequence_vars = _variables_with_counts(pattern.sequence_variables,
                                               pattern.sequence_variable_infos)
        if issubclass(pattern.operation, AssociativeOperation):
            sequence_vars += _variables_with_counts(fixed_vars, pattern.fixed_variable_infos)
            if pattern.wildcard_fixed is True:
                sequence_vars += (VariableWithCount(None, 1, pattern.wildcard_min_length), )
        if pattern.wildcard_fixed is False:
            sequence_vars += (VariableWithCount(None, 1, pattern.wildcard_min_length), )

        for sequence_subst in commutative_sequence_variable_partition_iter(
                Multiset(rem_expr), sequence_vars):
            if issubclass(pattern.operation, AssociativeOperation):
                for v in fixed_vars.distinct_elements():
                    if v not in sequence_subst:
                        continue
                    min_len = pattern.fixed_variable_infos[v].min_count
                    value = cast(Multiset, sequence_subst[v])
                    if len(value) > min_len:
                        # Surplus operands are wrapped in a new instance of
                        # the operation so the variable keeps min_len items.
                        normal = Multiset(list(value)[:min_len - 1])
                        wrapped = pattern.operation(*(value - normal))
                        normal.add(wrapped)
                        # next(iter(...)) instead of iter(...).next(): the
                        # method form only exists on Python 2 iterators.
                        sequence_subst[v] = normal if min_len > 1 else next(iter(normal))
                    else:
                        assert len(
                            value
                        ) == 1 and min_len == 1, u"Fixed variables with length != 1 are not supported."
                        sequence_subst[v] = next(iter(value))
            try:
                result = substitution.union(sequence_subst)
            except ValueError:
                # The two substitutions could not be combined (presumably
                # conflicting bindings); skip this candidate.
                pass
            else:
                for i in _check_constraints(result, constraints):
                    yield i
class CommutativePatternsParts(object):
    """The operands of a commutative pattern, grouped by how they are matched.

    Every group is a :class:`.Multiset` because the order of operands does
    not matter in a commutative operation. Several lengths are accumulated
    while the operands are sorted so that they do not have to be recomputed
    later.

    Instances are meant to be immutable, so do not change any attribute!

    Attributes:
        operation (Type[Operation]):
            The type of the original pattern expression.
        length (int):
            The total number of operands passed in.
        constant (Multiset):
            The operands for which ``is_constant`` is true.
        syntactic (Multiset):
            The non-constant, non-wildcard operands for which
            ``is_syntactic`` is true.
        rest (Multiset):
            The operands that fall into none of the other groups.
        sequence_variables (Multiset[str]):
            Names of the named wildcards without fixed size. Additional
            information is kept in ``sequence_variable_infos``.
        sequence_variable_infos (Dict[str, VarInfo]):
            Maps each sequence variable name to its ``VarInfo``, built from
            the minimum count, ``None`` as type and the wildcard's
            ``optional`` value.
        fixed_variables (Multiset[str]):
            Names of the named wildcards with fixed size. Additional
            information is kept in ``fixed_variable_infos``.
        fixed_variable_infos (Dict[str, VarInfo]):
            Maps each fixed variable name to its ``VarInfo``, built from the
            minimum count, the wildcard's ``symbol_type`` (``None`` if it has
            none) and its ``optional`` value.
        sequence_variable_min_length (int):
            Sum of ``min_count`` over the named sequence wildcards whose
            ``optional`` is ``None``.
        fixed_variable_length (int):
            Sum of ``min_count`` over the named fixed-size wildcards whose
            ``optional`` is ``None``.
        optional_count (int):
            Number of named fixed-size wildcards whose ``optional`` is not
            ``None``.
        wildcard_min_length (int):
            Sum of ``min_count`` over all unnamed wildcards.
        wildcard_fixed (Optional[bool]):
            ``None`` if there is no unnamed wildcard. Otherwise the
            ``fixed_size`` values of all unnamed wildcards combined with
            ``and``, i.e. truthy only if every one of them has a fixed size.
    """

    def __init__(self, operation: Type[Operation], *expressions: Expression) -> None:
        """Sort the operands of a commutative operation into their groups.

        Args:
            operation:
                The type of the commutative operation.
            *expressions:
                The operands of the commutative operation.
        """
        self.operation = operation
        self.length = len(expressions)

        self.constant = Multiset()  # type: Multiset
        self.syntactic = Multiset()  # type: Multiset
        self.rest = Multiset()  # type: Multiset
        self.sequence_variables = Multiset()  # type: Multiset[str]
        self.fixed_variables = Multiset()  # type: Multiset[str]
        self.sequence_variable_infos = dict()
        self.fixed_variable_infos = dict()

        self.sequence_variable_min_length = 0
        self.fixed_variable_length = 0
        self.wildcard_min_length = 0
        self.optional_count = 0
        self.wildcard_fixed = None

        for expression in expressions:
            if is_constant(expression):
                self.constant[expression] += 1
                continue

            if not isinstance(expression, Wildcard):
                bucket = self.syntactic if is_syntactic(expression) else self.rest
                bucket[expression] += 1
                continue

            wc = cast(Wildcard, expression)
            name = wc.variable_name

            if not name:
                # Unnamed wildcard: only its length and fixedness are tracked.
                self.wildcard_min_length += wc.min_count
                if self.wildcard_fixed is None:
                    self.wildcard_fixed = wc.fixed_size
                else:
                    self.wildcard_fixed = self.wildcard_fixed and wc.fixed_size
                continue

            if wc.fixed_size:
                self.fixed_variables[name] += 1
                symbol_type = getattr(wc, 'symbol_type', None)
                self._update_var_info(self.fixed_variable_infos, name, wc.min_count,
                                      symbol_type, wc.optional)
                if wc.optional is None:
                    self.fixed_variable_length += wc.min_count
                else:
                    self.optional_count += 1
            else:
                self.sequence_variables[name] += 1
                self._update_var_info(self.sequence_variable_infos, name, wc.min_count,
                                      None, wc.optional)
                if wc.optional is None:
                    self.sequence_variable_min_length += wc.min_count

    @staticmethod
    def _update_var_info(infos, name, count, symbol_type=None, default=None):
        """Register a variable's info, or check it against the stored one."""
        if name not in infos:
            infos[name] = VarInfo(count, symbol_type, default)
            return

        # Every occurrence of a variable must agree with the first one.
        known = infos[name]
        assert known.min_count == count
        assert known.type == symbol_type
        assert known.default == default

    def __str__(self):
        operands = [str(operand) for operand in self.constant]
        operands += [str(operand) for operand in self.syntactic]
        operands += [str(operand) for operand in self.rest]
        for variables in (self.sequence_variables, self.fixed_variables):
            for name, count in variables.items():
                operands.extend([name] * count)

        op_name = getattr(self.operation, 'name', self.operation.__name__)
        return '{}({})'.format(op_name, ', '.join(operands))
# NOTE(review): this fragment starts inside a loop whose header is not
# visible here; `item`, `min_bound`, `count`, `common_words` and `data` are
# defined outside this view. Confirm the nesting against the full file.
if item[1] >= min_bound:
    # The 100 is needed because intersection keeps the smaller of the two
    # multiplicities in the multisets; we want that smaller number to be
    # the token count.
    count += 1
    common_words.add(item[0], 100)
print('second part')
# process the data: restrict each entry's multiset to the common words
clean_data = []
for s in data:
    cleaner = s[1].intersection(common_words)
    clean_data.append([s[0], cleaner])
print('third part')
# Expand each multiset back into a flat token list, repeating every word
# according to its multiplicity.
output_data = []
for s, ms in clean_data:
    tokens = []
    for item in ms.items():
        for i in range(0, item[1]):
            tokens.append(item[0])
    output_data.append(s + [tokens])
df = pd.DataFrame(
    output_data,
    columns=['band', 'album', 'year', 'song', 'genre', 'tokens'])
# Only the 'genre' and 'tokens' columns are written out.
df = df.drop(['band', 'album', 'year', 'song'], axis=1)
df.to_csv('darklyrics-proc-tokens.csv', index=False)
# Per-word lookup tables: the set and the multiset of each word's letters.
# `words`, `letters`, `letters_set`, `N` and `MATCH_RANGE` come from outside
# this fragment.
word_sets = {}
word_msets = {}
# letter_lists[match][n][letter] collects the indices (into `words`) of the
# words that fall into that bucket; the meaning of `n` depends on `match`,
# see the loop below.
letter_lists = [[{le: [] for le in letters} for n in range(N + 1)] for match in MATCH_RANGE]
for wi, w in enumerate(words):
    word_s = set(w)
    word_sets[w] = word_s
    word_ms = Multiset(w)
    word_msets[w] = word_ms
    for n, le1 in enumerate(w):
        # match 0: every letter other than the one at position n
        for le0 in letters_set.difference([le1]):
            letter_lists[0][n][le0].append(wi)
        # match 1: the letter that is at position n
        letter_lists[1][n][le1].append(wi)
    # match 2: here n is the letter's multiplicity in the word ...
    for le2, n in word_ms.items():
        letter_lists[2][n][le2].append(wi)
    # ... and multiplicity 0 for the letters the word does not contain.
    for le2 in letters_set.difference(word_s):
        letter_lists[2][0][le2].append(wi)
# Same layout as letter_lists, with each index list converted to a set.
letter_sets = [[{le: set(letter_lists[match][n][le]) for le in letters} for n in range(N + 1)] for match in MATCH_RANGE]
n_words = len(words)
# Size of every set in letter_sets, precomputed.
letter_sets_len = \
    [[{le: len(letter_sets[match][n][le]) for le in letters} for n in range(N + 1)] for match in MATCH_RANGE]
# Counters keyed by (n, letter) for n in -1 .. N-1; only the n == -1 entries
# are updated in the code visible here.
freqs = {(n, le): 0 for n in range(-1, N) for le in letters}
# NOTE(review): zip(words, word_msets) iterates the dict's keys, so `wd` is
# the word string rather than its Multiset, and the zip is shorter than
# `words` if `words` contains duplicates. Confirm whether
# word_msets.values() was intended.
for w, wd in zip(words, word_msets):
    for le in wd:
        freqs[(-1, le)] += 1
from multiset import Multiset

# Walk the integers from 0 up to UPPER_BOUND, accumulating how often each
# decimal digit has been written so far, and report every (digit, i) pair
# for which the running count of that digit equals i.
UPPER_BOUND = 10**10
PROGRESS_STEP = 10000000

digit_counts = Multiset()
for i in range(UPPER_BOUND + 1):
    digit_counts.update(str(i))
    # Progress indicator.
    if not i % PROGRESS_STEP:
        print(i)
    for digit in (d for d, seen in digit_counts.items() if seen == i):
        print(digit, i)
def ac_operand_lists(t1: Operation, t2: Operation)\
        -> List[List[Tuple[Expression, Expression]]]:
    """Find all the sets of operand unification problems we can get from t1 and t2.

    Operands common to both terms are removed first. The remaining operands
    are split by ``to_ac_operands`` into constants, terms and variables, and
    every admissible assignment between the two sides is enumerated.

    Args:
        t1: The first operation; its ``operands`` are read.
        t2: The second operation; its ``operands`` are read.

    Returns:
        A list with one entry per admissible assignment. Each entry is a list
        of ``(expression, expression)`` pairs.

    Raises:
        NotImplementedError: If, after removing the common operands, both
            sides contain a ``Wildcard`` with multiplicity greater than one.
    """
    # Remove common operations
    t1_op_set = Multiset(t1.operands)
    t2_op_set = Multiset(t2.operands)
    common_ops = t1_op_set & t2_op_set
    t1_op_set -= common_ops
    t2_op_set -= common_ops
    # Does either side still contain a wildcard more than once?
    t1_duplicate_vars = any(
        isinstance(e, Wildcard) and n > 1 for e, n in t1_op_set.items())
    t2_duplicate_vars = any(
        isinstance(e, Wildcard) and n > 1 for e, n in t2_op_set.items())
    if t1_duplicate_vars and t2_duplicate_vars:
        raise (NotImplementedError(
            "Possible nontermination on this algo, dispatch slowward"))  # noqa
    elif t1_duplicate_vars or t2_duplicate_vars:
        # Only a warning on stdout; the enumeration continues.
        print("Redundant solutions really gosh darn likely")
    ret = []
    # Used below to combine several operands assigned to the same variable.
    op_function = get_head(t1)
    t1_ops = to_ac_operands(t1_op_set)
    t2_ops = to_ac_operands(t2_op_set)
    # Combined operand lists, ordered constants, then terms, then variables;
    # the index arithmetic below relies on this order.
    all_t1_ops = t1_ops.consts + t1_ops.terms + t1_ops.vars
    all_t2_ops = t2_ops.consts + t2_ops.terms + t2_ops.vars
    t1_n_consts = len(t1_ops.consts)
    t2_n_consts = len(t2_ops.consts)
    t1_n_terms = len(t1_ops.terms)
    t2_n_terms = len(t2_ops.terms)
    t1_n_vars = len(t1_ops.vars)
    t2_n_vars = len(t2_ops.vars)
    t1_n_ops = len(all_t1_ops)
    t2_n_ops = len(all_t2_ops)
    # Index of the first variable in the combined lists.
    t1_var_start = t1_n_ops - t1_n_vars
    t2_var_start = t2_n_ops - t2_n_vars
    # Positions idx where an operand equals its successor (idx + 1) within
    # its category.
    t1_equal_consts = [
        idx for idx in range(0, t1_n_consts - 1)
        if t1_ops.consts[idx] == t1_ops.consts[idx + 1]
    ]
    t2_equal_consts = [
        idx for idx in range(0, t2_n_consts - 1)
        if t2_ops.consts[idx] == t2_ops.consts[idx + 1]
    ]
    t1_equal_terms = [
        idx for idx in range(0, t1_n_terms - 1)
        if t1_ops.terms[idx] == t1_ops.terms[idx + 1]
    ]
    t2_equal_terms = [
        idx for idx in range(0, t2_n_terms - 1)
        if t2_ops.terms[idx] == t2_ops.terms[idx + 1]
    ]
    t1_equal_vars = [
        idx for idx in range(0, t1_n_vars - 1)
        if t1_ops.vars[idx] == t1_ops.vars[idx + 1]
    ]
    t2_equal_vars = [
        idx for idx in range(0, t2_n_vars - 1)
        if t2_ops.vars[idx] == t2_ops.vars[idx + 1]
    ]
    # const_rows_true_idx pairs each t1 constant with an index into
    # all_t2_ops (see the zip further down).
    # NOTE(review): presumably ints_walking_range(lo, hi, k) yields
    # k-sequences of indices from [lo, hi), which would restrict constants
    # to the variable positions of the other side; confirm against its
    # definition.
    for const_rows_true_idx in ints_walking_range(t2_var_start, t2_n_ops,
                                                  t1_n_consts):
        # Drop clear violations of the repeat property here
        if some_pairs_sorted(const_rows_true_idx, t1_equal_consts):
            continue
        # Likewise for each t2 constant, indexing into all_t1_ops.
        for const_cols_true_idx in ints_walking_range(t1_var_start, t1_n_ops,
                                                      t2_n_consts):
            if some_pairs_sorted(const_cols_true_idx, t2_equal_consts):
                continue
            # Terms may be paired with anything from the other side's first
            # term position onwards (terms and variables).
            for term_rows_true_idx in ints_walking_range(
                    t2_n_consts, t2_n_ops, t1_n_terms):
                if some_pairs_sorted(term_rows_true_idx, t1_equal_terms):
                    continue
                for term_cols_true_idx in ints_walking_range(
                        t1_n_consts, t1_n_ops, t2_n_terms):
                    if some_pairs_sorted(term_cols_true_idx, t2_equal_terms):
                        continue
                    # Term mismatch
                    # Skip candidates that fail this cross-check between the
                    # two term assignments for entries below the variable
                    # range (term-to-term pairings).
                    if any(row_nr < t1_var_start and (term_rows_true_idx[
                            row_nr - t1_n_consts] != rel_col_nr + t2_n_consts)
                           for rel_col_nr, row_nr in enumerate(
                               term_cols_true_idx)):
                        continue
                    if any(col_nr < t2_var_start and (term_cols_true_idx[
                            col_nr - t2_n_consts] != rel_row_nr + t1_n_consts)
                           for rel_row_nr, col_nr in enumerate(
                               term_rows_true_idx)):
                        continue
                    # Indices of the other side already hit by a constant or
                    # term assignment.
                    set_cols = (set(const_rows_true_idx)
                                | set(term_rows_true_idx))
                    set_rows = (set(const_cols_true_idx)
                                | set(term_cols_true_idx))
                    # var_mat[i, j] set means t1 variable i is paired with
                    # t2 variable j (see the np.nonzero loop below).
                    for var_mat in all_boolean_matrices(t1_n_vars, t2_n_vars):
                        # Filter out failures of unification
                        # A variable with an all-zero row/column must have
                        # received a constant or term instead.
                        if any(row_sum == 0
                               and raw_idx[0] + t1_var_start not in set_rows
                               for raw_idx, row_sum in np.ndenumerate(
                                   np.sum(var_mat, axis=1))):
                            continue
                        if any(col_sum == 0
                               and raw_idx[0] + t2_var_start not in set_cols
                               for raw_idx, col_sum in np.ndenumerate(
                                   np.sum(var_mat, axis=0))):
                            continue
                        # NOTE(review): presumably a symmetry check that
                        # rejects redundant assignments between adjacent
                        # equal variables; confirm against
                        # compare_equal_variable_vectors.
                        if any(
                                compare_equal_variable_vectors(
                                    i, var_mat[i, :], var_mat[i + 1, :],
                                    const_cols_true_idx, term_cols_true_idx)
                                for i in t1_equal_vars):
                            continue
                        if any(
                                compare_equal_variable_vectors(
                                    i, var_mat[:, i], var_mat[:, i + 1],
                                    const_rows_true_idx, term_rows_true_idx)
                                for i in t2_equal_vars):
                            continue
                        # Build the list of pairs for this assignment.
                        operand_tuples = []
                        t1_var_unifiers = defaultdict(
                            list
                        )  # type: DefaultDict[Expression, List[Expression]] # noqa: E501
                        t2_var_unifiers = defaultdict(
                            list
                        )  # type: DefaultDict[Expression, List[Expression]] # noqa: E501
                        for const, var_idx in zip(t1_ops.consts,
                                                  const_rows_true_idx):
                            var = all_t2_ops[var_idx]
                            t2_var_unifiers[var].append(const)
                        for const, var_idx in zip(t2_ops.consts,
                                                  const_cols_true_idx):
                            var = all_t1_ops[var_idx]
                            t1_var_unifiers[var].append(const)
                        for term, var_idx in zip(t1_ops.terms,
                                                 term_rows_true_idx):
                            expr = all_t2_ops[var_idx]
                            if isinstance(expr, Wildcard):
                                t2_var_unifiers[expr].append(term)
                            else:
                                # Term paired with a non-wildcard operand:
                                # emitted directly as a pair.
                                operand_tuples.append((term, expr))
                        for term, var_idx in zip(t2_ops.terms,
                                                 term_cols_true_idx):
                            expr = all_t1_ops[var_idx]
                            if isinstance(expr, Wildcard):
                                t1_var_unifiers[expr].append(term)
                            # Else case handled above
                        for idxs in np.transpose(np.nonzero(var_mat)):
                            row = t1_ops.vars[idxs[0]]
                            col = t2_ops.vars[idxs[1]]
                            t1_var_unifiers[row].append(col)
                            t2_var_unifiers[col].append(row)
                        # A variable with one operand is paired with it
                        # directly; several operands are combined with
                        # op_function first.
                        for d in [t1_var_unifiers, t2_var_unifiers]:
                            for var, ops in d.items():
                                if len(ops) == 1:
                                    operand_tuples.append((var, ops[0]))
                                else:
                                    operand_tuples.append(
                                        (var, op_function(*ops)))
                        ret.append(operand_tuples)
    return ret