Esempio n. 1
0
    def transform(self, dirty_df: pd.DataFrame, col: str):
        """Featurize one column: char n-gram counts plus two regex-pattern
        count encodings, concatenated into a single dense tensor.

        Args:
            dirty_df: input frame (possibly containing erroneous values).
            col: name of the column to featurize.

        Returns:
            A one-element list holding a tensor of shape
            (n_rows, n_char_feats + n_regex_feats + n_regex2_feats).
        """
        # Hoist the column access; all three encoders read the same values.
        values = dirty_df[col].values

        char_features = self.char_counter.transform(values.tolist()).todense()

        # Per-character pattern abstraction of each value.
        regex_features = self.regex_counter.transform(
            [str2regex(val, match_whole_token=False) for val in values]
        ).todense()

        # Whole-token pattern abstraction (coarser than the per-char one).
        regex_features2 = self.regex_counter2.transform(
            [str2regex(val, match_whole_token=True) for val in values]
        ).todense()

        return [
            torch.tensor(
                np.concatenate(
                    [char_features, regex_features, regex_features2], axis=1))
        ]
Esempio n. 2
0
 def fit(self, dirty_df: pd.DataFrame, col: str):
     """Fit the char-level counter and both regex-pattern counters on
     the values of one column."""
     column_values = dirty_df[col].values
     self.char_counter.fit(column_values.tolist())

     per_char_patterns = [
         str2regex(v, match_whole_token=False) for v in column_values
     ]
     self.regex_counter.fit(per_char_patterns)

     whole_token_patterns = [
         str2regex(v, match_whole_token=True) for v in column_values
     ]
     self.regex_counter2.fit(whole_token_patterns)
Esempio n. 3
0
    def get_coexist_counts(self, values):
        """For every pair of values, count the indexed rows that contain
        both of their whole-token regex patterns.

        Args:
            values: iterable of raw cell values.

        Returns:
            Nested dict ``counts[val1][val2] -> int`` co-occurrence count
            (0 when either value has no hits in the index).
        """
        # Materialize as a list so query order and response order agree.
        unique_values = list(set(values))

        # msearch NDJSON: an empty header line ("{}") before each body.
        query = "{}\n" + "\n{}\n".join(
            json.dumps(
                {
                    "query": {
                        "term": {
                            "data": {
                                "value": str2regex(val, match_whole_token=True)
                            }
                        }
                    }
                }
            )
            for val in unique_values
        )
        mresult = self.es.msearch(query, index="n_reversed_indices")

        # BUG FIX: responses line up with `unique_values`, not with
        # `values` — the original indexed them by enumerate(values), which
        # mis-assigns hits whenever `values` contains duplicates or a
        # different ordering. Map value -> hit indices explicitly.
        hit_sets = {
            val: None if idx_list is None else set(idx_list)
            for val, idx_list in zip(
                unique_values,
                (ESQuery.get_results(res, "idx")
                 for res in mresult["responses"]),
            )
        }

        coexist_count = defaultdict(dict)
        for val1 in values:
            set1 = hit_sets[val1]  # hoisted: inner loop is quadratic
            for val2 in values:
                set2 = hit_sets[val2]
                if set1 is None or set2 is None:
                    coexist_count[val1][val2] = 0
                else:
                    # BUG FIX: the original stored the intersection *set*
                    # here while the miss branch stored the int 0; store a
                    # count consistently, as the method name promises.
                    coexist_count[val1][val2] = len(set1 & set2)

        return coexist_count
Esempio n. 4
0
    def transform(self, dirty_df: pd.DataFrame, col: str):
        """Return the TF-IDF features of one column as a single tensor.

        Args:
            dirty_df: input frame.
            col: column to featurize.

        Returns:
            One-element list holding a (n_rows, n_tfidf_features) tensor.
        """
        tfidf = self.tfidf.transform(dirty_df[col].values.tolist()).todense()

        # NOTE(review): the original also computed `sym_tfidf` features
        # (regex-pattern TF-IDF, fitted in `fit`) but never included them in
        # the returned tensor. That dead computation is dropped here; if the
        # pattern features were intended, concatenate them below and
        # re-check downstream feature dimensions.
        return [torch.tensor(np.concatenate([tfidf], axis=1))]
Esempio n. 5
0
    def fit(self, values):
        """Build frequency counters over raw values, their character-class
        patterns, and their (raw and symbolized) character trigrams, then
        wire each scoring function to its counter."""
        # Flatten per-value trigram lists into one stream of trigrams.
        all_trigrams = [
            "".join(gram)
            for val in values
            for gram in xngrams(val, 3)
        ]
        self.trigram_counter = Counter(all_trigrams)
        self.sym_trigram_counter = Counter(
            str2regex(gram, False) for gram in all_trigrams
        )

        self.val_counter = Counter(values)
        self.sym_val_counter = Counter(
            str2regex(val, False) for val in values
        )

        # Dispatch table: scoring function -> the counter it consults.
        self.func2counter = {
            val_trigrams: self.trigram_counter,
            sym_trigrams: self.sym_trigram_counter,
            value_freq: self.val_counter,
            sym_value_freq: self.sym_val_counter,
        }
Esempio n. 6
0
def sym_value_freq(values, counter):
    """Value-frequency features computed over whole-token regex patterns
    instead of the raw values."""
    whole_token_patterns = [str2regex(v, True) for v in values]
    return value_freq(whole_token_patterns, counter)
Esempio n. 7
0
def sym_trigrams(values, counter):
    """Trigram features computed over per-character regex patterns
    instead of the raw values."""
    return val_trigrams([str2regex(v, False) for v in values], counter)
Esempio n. 8
0
 def transform(self, dirty_df: pd.DataFrame, col):
     """Map each cell to the relative frequency of its whole-token regex
     pattern, as counted during `fit`.

     Returns:
         Array of per-row pattern frequencies in [0, 1].
     """
     n_rows = len(dirty_df)  # hoisted: invariant across rows
     # BUG FIX: `self.counter` is a plain dict (built via .to_dict() in
     # fit), so `self.counter[...]` raised KeyError for any pattern unseen
     # at fit time; unseen patterns now score 0.
     return (dirty_df[col].swifter.apply(
         lambda x: self.counter.get(str2regex(x, match_whole_token=True), 0)
         / n_rows).values)
Esempio n. 9
0
 def fit(self, dirty_df: pd.DataFrame, col):
     """Count the whole-token regex patterns of one column into a dict
     mapping pattern -> occurrence count."""
     pattern_series = dirty_df[col].swifter.apply(
         lambda value: str2regex(value, match_whole_token=True))
     self.counter = pattern_series.value_counts().to_dict()
Esempio n. 10
0
 def fit(self, dirty_df: pd.DataFrame, col: str):
     """Fit the raw-text and regex-pattern TF-IDF vectorizers on one
     column."""
     raw_values = dirty_df[col].values.tolist()
     self.tfidf.fit(raw_values)

     pattern_values = dirty_df[col].apply(
         lambda value: str2regex(value, match_whole_token=False)).values
     self.sym_tfidf.fit(pattern_values)
Esempio n. 11
0
def clean_str(x):
    """Strip whitespace, drop non-ASCII characters, and return the
    whole-token regex pattern of the cleaned string."""
    ascii_only = x.strip().encode("ascii", errors="ignore").decode("ascii")
    return str2regex(ascii_only, True)