Example #1
    def multi_tokens(self, headers):

        # 1. Create the output dataframe if it doesn't exist yet
        if self.__class__.df.shape[0] == 0:
            self.create_output(headers)
        # 2. Compute multi tokens
        if self.__class__.raw_text.empty:
            nlp_select = kex.NLPSelect(columns=headers)
            self.__class__.raw_text = nlp_select.transform(self.__class__.df)
        tex = kex.TokenExtractor()
        toks = tex.fit_transform(self.__class__.raw_text)  # fit the unigram model; its vocabulary seeds the bigram pass
        tex2 = kex.TokenExtractor(ngram_range=(2, 2))
        vocab = kex.generate_vocabulary_df(tex)
        replaced_text = kex.token_to_alias(self.__class__.raw_text, vocab)
        toks2 = tex2.fit_transform(replaced_text)
        tokens = tex2.vocab_.tolist()
        # 3. Create the vocab dataframe
        empty_array = np.full(len(tex2.vocab_), "", dtype=object)  # placeholder annotation columns
        data = np.column_stack(
            (tex2.vocab_, empty_array, empty_array, empty_array, tex2.scores_)
        )
        self.__class__.vocab_multi_df = pd.DataFrame(
            data=data, columns=self.__class__.vocab_columns
        )
        return tokens
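Note that multi_tokens leans on class-level state (df, raw_text, vocab_columns) that must already be populated, for instance by uploadJSON in the next example. A minimal sketch of the call order; the Tagger class name and the payload variable are illustrative assumptions, not part of the original code:

    # Hypothetical driver; `Tagger` and `payload` are illustrative names only.
    tagger = Tagger()
    tagger.uploadJSON(payload, headers=["OriginalShorttext"])    # populates df and raw_text
    bigrams = tagger.multi_tokens(headers=["OriginalShorttext"])
    print(bigrams[:10])  # first ten extracted two-word tokens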
Example #2
    def uploadJSON(self, jsonstring, headers):

        # the payload is newline-delimited CSV text, one record per line
        rows = jsonstring.split("\n")
        del rows[-1]  # drop the empty entry left by the trailing newline
        data = [
            np.array(
                [
                    '"{}"'.format(x)
                    for x in list(csv.reader([row], delimiter=",", quotechar='"'))[0]
                ]
            )
            for row in rows
        ]
        d = pd.DataFrame(data=data[1:], columns=data[0])
        d.columns = [c.replace('"', "") for c in d.columns.values]
        d.fillna(value='"', inplace=True)
        self.__class__.df = d.applymap(lambda x: x.replace('"', ""))

        # 1. Create the output dataframe
        self.create_output(headers)
        # 2. Compute single tokens
        nlp_select = kex.NLPSelect(columns=headers)
        self.__class__.raw_text = nlp_select.transform(self.__class__.df)
        tex = kex.TokenExtractor()
        toks = tex.fit_transform(self.__class__.raw_text)
        tokens = tex.vocab_.tolist()
        # 3. Create the vocab dataframe
        empty_array = np.full(len(tex.vocab_), "", dtype=object)  # placeholder annotation columns
        data = np.column_stack(
            (tex.vocab_, empty_array, empty_array, empty_array, tex.scores_)
        )
        self.__class__.vocab_single_df = pd.DataFrame(
            data=data, columns=self.__class__.vocab_columns
        )

        # 4. Compute multi (two-word) tokens; raw_text was just computed above
        tex2 = kex.TokenExtractor(ngram_range=(2, 2))
        vocab = kex.generate_vocabulary_df(tex)
        replaced_text = kex.token_to_alias(self.__class__.raw_text, vocab)
        toks2 = tex2.fit_transform(replaced_text)
        tokens = tex2.vocab_.tolist()
        # 5. Create the multi-token vocab dataframe
        empty_array = np.full(len(tex2.vocab_), "", dtype=object)  # placeholder annotation columns
        data = np.column_stack(
            (tex2.vocab_, empty_array, empty_array, empty_array, tex2.scores_)
        )
        self.__class__.vocab_multi_df = pd.DataFrame(
            data=data, columns=self.__class__.vocab_columns
        )
        return tokens
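For well-formed input, the manual quote/unquote round-trip above can usually be delegated to pandas' own CSV reader; a minimal sketch, assuming the same comma-delimited, double-quoted payload:

    import io

    import pandas as pd

    def parse_csv_payload(payload: str) -> pd.DataFrame:
        # dtype=str keeps every cell textual; keep_default_na=False yields ""
        # for missing cells, mirroring the fillna/strip dance above
        return pd.read_csv(io.StringIO(payload), dtype=str, keep_default_na=False)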
Example #3
    def single_tokens(self, headers):

        # 1. Create the output dataframe
        self.create_output(headers)
        # 2. Compute single tokens
        nlp_select = kex.NLPSelect(columns=headers)
        self.__class__.raw_text = nlp_select.transform(self.__class__.df)
        tex = kex.TokenExtractor()
        toks = tex.fit_transform(self.__class__.raw_text)
        tokens = tex.vocab_.tolist()
        # 3. Create the vocab dataframe
        empty_array = np.full(len(tex.vocab_), "", dtype=object)  # placeholder annotation columns
        data = np.column_stack(
            (tex.vocab_, empty_array, empty_array, empty_array, tex.scores_)
        )
        self.__class__.vocab_single_df = pd.DataFrame(
            data=data, columns=self.__class__.vocab_columns
        )
        return tokens
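The same single-token pass also works outside the class; a self-contained sketch on a toy frame, using only the kex calls that already appear in these examples (kex and numpy are assumed imported as above; the column name and texts are made up):

    import pandas as pd

    toy_df = pd.DataFrame({"OriginalShorttext": [
        "hydraulic pump leaking at seal",
        "replaced hydraulic pump seal",
    ]})
    nlp_select = kex.NLPSelect(columns=["OriginalShorttext"])
    raw_text = nlp_select.transform(toy_df)
    tex = kex.TokenExtractor()
    tex.fit_transform(raw_text)  # fits the model; vocab_ and scores_ become available
    print(list(zip(tex.vocab_, tex.scores_))[:5])  # first five tokens with their scores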
Example #4
    def completeness(self):

        tex = kex.TokenExtractor()
        tag_df = kex.tag_extractor(
            tex,
            self.__class__.raw_text,
            vocab_df=self.__class__.vocab_single_df.replace(
                r"^\s*$", np.nan, regex=True
            )
            .set_index("tokens")
            .astype({"score": "float64"}),
        )
        tag_pct, tag_comp, tag_empt = kex.get_tag_completeness(tag_df)
        tag_pct_array = list(tag_pct.items())
        return (
            tag_comp.item(),
            tag_empt.item(),
            tag_pct_array,
            # distinct aliases, summed across entity (NE) classes
            self.__class__.vocab_single_df.groupby("NE").nunique().alias.sum().item(),
            # vocab entries that have been assigned a non-empty NE class
            self.__class__.vocab_single_df[self.__class__.vocab_single_df.NE != ""]
            .NE.notna()
            .sum()
            .item(),
        )
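The replace/set_index/astype chain above exists because tag_extractor wants the vocabulary indexed by token, with NaN rather than empty strings and a numeric score column. The same preparation, written out step by step (a restatement of the call above, not new behavior):

    vocab_df = (
        self.__class__.vocab_single_df
        .replace(r"^\s*$", np.nan, regex=True)   # blank annotation cells -> NaN
        .set_index("tokens")                     # tag_extractor expects a token index
        .astype({"score": "float64"})            # scores were stored as strings
    )
    tag_df = kex.tag_extractor(tex, self.__class__.raw_text, vocab_df=vocab_df)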
Example #5
    known_repl = {
        # domain-specific token replacements (truncated in the original snippet)
    }
    nlp_select = kex.NLPSelect(
        columns=['OriginalShorttext'],
        special_replace=known_repl,
    )
    raw_text = nlp_select.transform(df)
    # raw_text, with token-->alias replacement
    replaced_text = kex.token_to_alias(raw_text, vocab)

    which_types = [  # only items/object within system, not actions/properties
        'I',
        # 'P', 'S',
        # 'U',
    ]

    tex = kex.TokenExtractor()
    toks = tex.fit_transform(replaced_text)
    tag_df = kex.tag_extractor(tex, replaced_text, vocab_df=vocab)[which_types]

    filt_tags = (
        tag_df.droplevel(0, axis=1)
        .pipe(filter_tag_names, ['position', 'right'])
        .pipe(filter_tag_occurrences, ntags=args.ntags, freq=args.freq, topn=args.topn)
    )
    voc_nodes = filt_tags.columns

    walks = extract_walk(replaced_text[filt_tags.index],
                         vocab[vocab.index.isin(filt_tags.columns)])
    logging.info(
        'Tag RW length breakdown:\n' +
        walks.apply(len).describe().round(2).to_csv(encoding='utf8', sep='\t'))
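As the indexing here implies, tag_extractor returns a frame whose columns are a two-level MultiIndex (entity class, tag name), which is why tag_df[which_types] keeps whole classes and droplevel(0, axis=1) then flattens to bare tag names. A self-contained pandas sketch of that selection pattern, with made-up data:

    import pandas as pd

    cols = pd.MultiIndex.from_tuples([("I", "pump"), ("I", "seal"), ("P", "replace")])
    tag_df = pd.DataFrame([[1, 0, 1], [0, 1, 1]], columns=cols)
    items = tag_df[["I"]].droplevel(0, axis=1)  # keep item tags, drop the class level
    print(items.columns.tolist())  # -> ['pump', 'seal']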