def get_multi_term_patterns(dataframe, col_with_prefixes, prefix_col):
    """Build a mapping from each distinct prefix to an unanchored regex.

    For every distinct value found in ``dataframe[prefix_col]`` (visited in
    sorted order), the values of ``dataframe[col_with_prefixes]`` on the
    matching rows are passed to ``rexpy.extract``. Only the first pattern
    returned is kept, with a leading ``^`` and a trailing ``$`` removed.

    Args:
        dataframe: Table indexable by column name and by boolean mask.
        col_with_prefixes: Name of the column holding the example values.
        prefix_col: Name of the column holding the prefix of each row.

    Returns:
        dict mapping each distinct prefix to its stripped pattern string.
    """
    patterns_by_prefix = {}
    for prefix in sorted(set(dataframe[prefix_col])):
        value_column = dataframe[col_with_prefixes]
        examples = list(value_column[dataframe[prefix_col] == prefix])
        # Only the first extracted expression is used.
        first_rex = rexpy.extract(examples)[0]
        # Drop the start/end anchors so the pattern can be embedded elsewhere.
        without_start_anchor = re.sub('^\\^', '', first_rex)
        patterns_by_prefix[prefix] = re.sub('\\$$', '', without_start_anchor)
    return patterns_by_prefix
def pattern_discovery(dataset, file_name):
    """Discover a regex pattern per column and save the result to a file.

    For each column of ``dataset`` the distinct values are converted to
    strings and passed to ``rexpy.extract``; the regular expressions found
    for that column are concatenated into a single string and wrapped in
    single quotes. One row ``(col, num, pattern)`` is produced per column,
    where ``col`` is the column's own name and ``num`` its position.

    The table is written to ``./save/<file_name>_patterns.txt`` as
    ``;``-separated text. The ``./save`` directory is not created here.

    Args:
        dataset: pandas DataFrame whose columns are analysed.
        file_name: Stem used to build the output file name.

    Returns:
        pandas DataFrame with columns ``col``, ``num`` and ``pattern``.
    """
    from tdda import rexpy

    fn = './save/' + file_name + '_patterns.txt'

    rows = []
    # Pair each pattern with the column it was extracted from; the earlier
    # version labelled every row with the last column visited.
    for i, c in enumerate(dataset.columns.values):
        corpus = dataset[c].unique().astype('str').tolist()
        results = rexpy.extract(corpus)
        # Join all expressions found for the column; an empty result
        # yields an empty pattern instead of raising.
        pattern = ''.join(results)
        rows.append((c, i, "'" + pattern + "'"))

    p = pd.DataFrame(rows, columns=['col', 'num', 'pattern'])
    p.to_csv(fn, header=('col', 'num', 'pattern'), index=False, sep=';')
    return p
def discover_id_pattern(example_ids):
    """Return the first regex found for ``example_ids``, without anchors.

    The examples are passed to ``rexpy.extract``; the first pattern in the
    result is taken, and a leading ``^`` and a trailing ``$`` are removed.

    Args:
        example_ids: Example identifier strings to generalise.

    Returns:
        The stripped pattern string.
    """
    first_rex = rexpy.extract(example_ids)[0]
    without_start_anchor = re.sub('^\\^', '', first_rex)
    return re.sub('\\$$', '', without_start_anchor)
def find_rexes(self, colname, values=None):
    """Find regular expressions for a column or for explicit values.

    Args:
        colname: Name of the column in ``self.df`` to analyse when no
            explicit values are given.
        values: Optional values to analyse instead of the column.

    Returns:
        Whatever ``rexpy.extract`` (explicit values) or ``rexpy.pdextract``
        (column of ``self.df``) returns.
    """
    if values is not None:
        return rexpy.extract(values)
    return rexpy.pdextract(self.df[colname])
def find_rexes(self, colname, values=None):
    """Find regular expressions for the given values or a database column.

    Args:
        colname: Column whose unique values are fetched when ``values`` is
            falsy (``None`` or empty).
        values: Optional values to analyse directly.

    Returns:
        The result of ``rexpy.extract`` on the chosen values.
    """
    corpus = (
        values
        if values
        else self.get_database_unique_values(self.tablename, colname)
    )
    return rexpy.extract(corpus)
def find_rexes(self, colname, values=None, seed=None):
    """Find regular expressions for the given values or a database column.

    Args:
        colname: Column whose unique values are fetched when ``values`` is
            falsy (``None`` or empty).
        values: Optional values to analyse directly.
        seed: Passed through to ``rexpy.extract`` as its ``seed`` argument.

    Returns:
        The result of ``rexpy.extract`` on the chosen values.
    """
    if values:
        corpus = values
    else:
        corpus = self.get_database_unique_values(self.tablename, colname)
    return rexpy.extract(corpus, seed=seed)
from __future__ import print_function

from tdda.rexpy import extract
from tdda.rexpy.seq import common_string_sequence
from tdda.rexpy.relib import re

# Extract tagged patterns as an object so the per-pattern example matches
# and the regexes themselves can both be inspected below.
x = extract(['Roger', 'Coger', 'Doger'], tag=True, as_object=True)
print(x)

patternToExamples = x.pattern_matches()
sequences = []
for j, (pattern, examples) in enumerate(patternToExamples.items()):
    N = len(examples)
    if N < 1:
        # No examples for this pattern: nothing to compare, just report it.
        print('%s:%s' % (pattern, examples))
        continue

    # Split each example into the groups captured by the j-th regex.
    eparts = [re.match(x.results.rex[j], e).groups() for e in examples]
    nparts = len(eparts[0])
    for i in range(nparts):
        # Fold the common sequence across all examples for group i.
        # Starting from the first example alone means a pattern with a
        # single example no longer indexes a non-existent second one.
        s = eparts[0][i]
        for parts in eparts[1:]:
            s = common_string_sequence(s, parts[i])
            if s == '':
                # Nothing in common any more; further comparison is moot.
                break
        sequences.append(s)
print(sequences)
from tdda import rexpy

# Example strings from which regular expressions are extracted.
corpus = [
    '123-AA-971',
    '12-DQ-802',
    '198-AA-045',
    '1-BA-834',
]
results = rexpy.extract(corpus)

# Report how many expressions were found, then list each one.
summary = 'Number of regular expressions found: %d' % len(results)
print(summary)
for found in results:
    print(' ' + found)
def find_rexes(self, colname, values=None, seed=None):
    """Find regular expressions for a column or for explicit values.

    Args:
        colname: Name of the column in ``self.df`` to analyse when no
            explicit values are given.
        values: Optional values to analyse instead of the column.
        seed: Passed through to ``rexpy.extract`` when ``values`` is given.
            It was previously accepted but discarded (``seed=None`` was
            hard-coded in the call).

    Returns:
        Whatever ``rexpy.pdextract`` (column of ``self.df``) or
        ``rexpy.extract`` (explicit values) returns.
    """
    if values is None:
        # NOTE(review): the seed is not passed on this path; confirm
        # whether rexpy.pdextract accepts a ``seed`` argument before
        # forwarding it here.
        return rexpy.pdextract(self.df[colname])
    return rexpy.extract(values, seed=seed)