Beispiel #1
0
def get_multi_term_patterns(dataframe, col_with_prefixes, prefix_col):
    """Map each distinct prefix to an unanchored regex for its rows.

    For every distinct value found in ``dataframe[prefix_col]`` (visited in
    sorted order), the values of ``dataframe[col_with_prefixes]`` on the
    rows carrying that prefix are handed to ``rexpy.extract``. Only the
    first regex returned is kept; a leading ``^`` and a trailing ``$`` are
    removed from it.

    Returns:
        dict keyed by prefix value, holding the stripped regex string.
    """
    prefix_series = dataframe[prefix_col]
    patterns_by_prefix = {}
    for prefix in sorted(set(prefix_series)):
        members = dataframe[col_with_prefixes][prefix_series == prefix]
        # Any further regexes rexpy returns for this group are ignored.
        first_rex = rexpy.extract(list(members))[0]
        # Remove the anchors: leading '^' first, then trailing '$'.
        without_start = re.sub('^\\^', '', first_rex)
        patterns_by_prefix[prefix] = re.sub('\\$$', '', without_start)
    return patterns_by_prefix
Beispiel #2
0
def pattern_discovery(dataset, file_name):
    """Extract rexpy patterns per column and save them as a ';'-separated file.

    For each column of ``dataset`` the unique values (cast to ``str``) are
    passed to ``rexpy.extract`` and the result is stored under the column
    name. After each column is processed, one row is emitted for every
    column stored so far: ``(current column, position, quoted pattern)``,
    where the pattern is the concatenation of all regex strings stored at
    that position.

    The rows are written to ``./save/<file_name>_patterns.txt`` and also
    returned as a DataFrame with columns ``col``, ``num``, ``pattern``.
    """
    from tdda import rexpy

    import collections

    from functools import reduce

    def concat(left, right):
        # Works for both list + list and str + str.
        return left + right

    out_path = './save/' + file_name + '_patterns.txt'
    per_column = collections.defaultdict(list)
    rows = []

    for column in dataset.columns.values:
        distinct = dataset[column].unique().astype('str').tolist()
        per_column[column].append(rexpy.extract(distinct))

        # NOTE(review): this loop runs inside the column loop, so the row
        # count grows quadratically and every row is labelled with the
        # current column even when the pattern belongs to an earlier one.
        # Confirm whether that is intended before relying on the output.
        for position, rex_lists in enumerate(list(per_column.values())):
            # First fold flattens the list of result lists, second fold
            # joins the regex strings into a single string.
            merged = reduce(concat, reduce(concat, rex_lists))
            rows.append((column, position, "'" + str(merged) + "'"))

    frame = pd.DataFrame(rows, columns=['col', 'num', 'pattern'])
    frame.to_csv(out_path, header=('col', 'num', 'pattern'),
                 index=False, sep=';')
    return frame
Beispiel #3
0
def discover_id_pattern(example_ids):
    """Return the first regex rexpy infers for ``example_ids``, unanchored.

    Only the first entry of the list returned by ``rexpy.extract`` is
    used; a leading ``^`` and a trailing ``$`` are removed from it.
    """
    first_pattern = rexpy.extract(example_ids)[0]
    without_start = re.sub('^\\^', '', first_pattern)
    return re.sub('\\$$', '', without_start)
Beispiel #4
0
 def find_rexes(self, colname, values=None):
     """Return the rexpy result for ``values``, or for column ``colname``.

     When ``values`` is given (anything other than ``None``) it is passed
     to ``rexpy.extract``; otherwise ``self.df[colname]`` is passed to
     ``rexpy.pdextract``.
     """
     if values is not None:
         return rexpy.extract(values)
     return rexpy.pdextract(self.df[colname])
Beispiel #5
0
 def find_rexes(self, colname, values=None):
     """Return the rexpy result for ``values`` or the column's stored values.

     A falsy ``values`` (``None`` or empty) is replaced by the result of
     ``self.get_database_unique_values(self.tablename, colname)`` before
     being passed to ``rexpy.extract``.
     """
     candidates = values or self.get_database_unique_values(self.tablename,
                                                            colname)
     return rexpy.extract(candidates)
Beispiel #6
0
 def find_rexes(self, colname, values=None, seed=None):
     """Return the rexpy result for ``values`` or the column's stored values.

     A falsy ``values`` (``None`` or empty) is replaced by the result of
     ``self.get_database_unique_values(self.tablename, colname)``. The
     ``seed`` argument is forwarded unchanged to ``rexpy.extract``.
     """
     candidates = values or self.get_database_unique_values(self.tablename,
                                                            colname)
     return rexpy.extract(candidates, seed=seed)
Beispiel #7
0
from __future__ import print_function

from tdda.rexpy import extract
from tdda.rexpy.seq import common_string_sequence
from tdda.rexpy.relib import re

# Infer tagged regexes for a tiny corpus and keep the extractor object so
# that the per-pattern example lists can be inspected afterwards.
x = extract(['Roger', 'Coger', 'Doger'], tag=True, as_object=True)
print(x)

patternToExamples = x.pattern_matches()

# For every capture group of every pattern, collect the string sequence
# shared by all examples of that pattern.
sequences = []
for j, (pattern, examples) in enumerate(patternToExamples.items()):
    N = len(examples)
    if N < 1:
        # Nothing to compare; just report the pattern.
        print('%s:%s' % (pattern, examples))
    else:
        # NOTE(review): assumes the j-th entry of pattern_matches() lines up
        # with x.results.rex[j] and that every example matches it - confirm.
        eparts = [re.match(x.results.rex[j], e).groups() for e in examples]
        nparts = len(eparts[0])
        for i in range(nparts):
            # Start from the first example's group and fold in the others.
            # Seeding from one example (instead of indexing eparts[1]
            # directly) keeps a pattern with a single example from raising
            # IndexError; its common sequence is the group text itself.
            s = eparts[0][i]
            for parts in eparts[1:]:
                s = common_string_sequence(s, parts[i])
                if s == '':
                    # Nothing in common any more; later examples cannot
                    # bring it back.
                    break

            sequences.append(s)
print(sequences)

Beispiel #8
0
from tdda import rexpy

# Sample identifiers from which rexpy should infer regular expressions.
corpus = [
    '123-AA-971',
    '12-DQ-802',
    '198-AA-045',
    '1-BA-834',
]
results = rexpy.extract(corpus)

# Report how many expressions were found, then list each one indented.
print('Number of regular expressions found: %d' % len(results))
for rex in results:
    print('   ' + rex)
Beispiel #9
0
 def find_rexes(self, colname, values=None, seed=None):
     """Return the rexpy result for ``values``, or for column ``colname``.

     Args:
         colname: column of ``self.df`` to use when ``values`` is ``None``.
         values: optional explicit values to extract patterns from.
         seed: forwarded to ``rexpy.extract`` when ``values`` is given.

     Returns:
         Whatever ``rexpy.pdextract`` / ``rexpy.extract`` returns.
     """
     if values is None:
         # NOTE(review): seed is not forwarded on this path; confirm
         # whether rexpy.pdextract accepts a seed argument before adding it.
         return rexpy.pdextract(self.df[colname])
     # Pass the caller's seed through; it was previously hard-coded to None,
     # which silently ignored the parameter.
     return rexpy.extract(values, seed=seed)