Code Example #1
    def __init__(self, lang="en", stop_words_path=None):
        if stop_words_path:
            self.__stop_words_pattern = self.build_stop_word_regex_from_file(
                stop_words_path)
        else:
            stoplist = stopwordsiso.stopwords(lang)
            if not stopwordsiso.has_lang(lang):
                lang2 = lang.split("-")[0].lower()
                if not stopwordsiso.has_lang(lang2):
                    raise ValueError(
                        "No bundled stopword list available for {lang}, "
                        "initialize Rake with stop_words_path "
                        "argument".format(lang=lang))
                stoplist = stopwordsiso.stopwords(lang2)

            self.__stop_words_pattern = self.build_stop_word_regex(stoplist)
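The snippet above falls back from a region-tagged code (e.g. "en-GB") to its base language before raising. A stand-alone sketch of the same idea, assuming only the public stopwordsiso functions has_lang and stopwords:

import stopwordsiso

def load_stoplist(lang):
    # Try the exact code first, then the base language (e.g. "pt" for "pt-BR").
    if stopwordsiso.has_lang(lang):
        return stopwordsiso.stopwords(lang)
    base = lang.split("-")[0].lower()
    if stopwordsiso.has_lang(base):
        return stopwordsiso.stopwords(base)
    raise ValueError("No bundled stopword list available for {}".format(lang))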
Code Example #2
def clear(text):
    text = text.lower()
    text = re.sub(r"_+", "", text)
    text = re.sub(r"\b\d+\b", "", text)
    text = " ".join(
        [w for w in text.split() if not w in stopwords.stopwords("pt")])
    return text
Code Example #3
def remove_stopwords(doc,
                     langs='en',
                     extended_stopwords=None,
                     tokentype='lemma'):
    '''
    Remove stopwords
    '''
    if isinstance(langs, str):
        langs = [langs]

    stopword_list = set()
    for l in langs:
        stopword_list.update(list(stopwords(l)))

    if extended_stopwords:
        stopword_list.update(extended_stopwords)

    # STOPWORDS
    stop_ids = [
        idx for idx, value in enumerate(doc[tokentype])
        if value in stopword_list
    ]
    doc_sw_rm = remove_ids_all_keys(doc, stop_ids)

    return doc_sw_rm
Code Example #4
def remove_stopwords(text, lang='et'):
    if lang == 'ee':
        lang = 'et'
    sw = stopwords(lang)
    for key in sw:
        # str.replace returns a new string, so reassign the result
        text = text.replace(key, "")
    return text
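The character-level replace above can also strip stopwords that occur inside longer words. A token-level variant, shown here as a minimal sketch (not part of the original project) that only assumes the public stopwordsiso API:

import stopwordsiso

def remove_stopwords_tokenized(text, lang='et'):
    # Filter whole tokens instead of calling str.replace per stopword,
    # so substrings embedded in longer words are left untouched.
    sw = stopwordsiso.stopwords(lang)
    return " ".join(w for w in text.split() if w.lower() not in sw)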
Code Example #5
def getStopWords(spacy_model):
    """Stop words tokenized with the default raw analyzer."""
    # for languages available go to: https://github.com/stopwords-iso
    s_words = stopwords.stopwords('en')

    analyzer = partial(rawAnalyzer, spacy_model, [])
    return seq(s_words).flat_map(analyzer).to_list()
Code Example #6
def langmodelload(language, LibLocLang=CurLibLocLang):
    ##
    global model
    global stop_words
    global question_words
    ###
    if language == "en":
        model = Model(LibLocLang + 'english-ewt-ud-2.5-191206.udpipe')
        question_words = ['where', 'which', "who", "why", "what", "when", "please", "how", "is", "are", "will", "could",
                          "should", "was", "were", "do", "did", "can"]
    elif language == "ar":
        model = Model(LibLocLang + 'arabic-padt-ud-2.5-191206.udpipe')
        question_words = ['أين', "أي", "من", "لماذا", "ماذا", "متى", "من فضلك", "كيف", "هي", "هي", "سوف", "يمكن", "يجب",
                          "كانت ", " كان ", " فعل ", " فعل ", " يمكنه "]
    elif language == "zh":
        model = Model(LibLocLang + 'chinese-gsdsimp-ud-2.5-191206.udpipe')
        question_words = ["哪里", "哪个", "谁", "为什么", "什么", "何时", "请", "如何", "是", "将", "可以", "应该", "被", "做"]
    elif language == "id":
        model = Model(LibLocLang + 'indonesian-gsd-ud-2.5-191206.udpipe')
        question_words = ['dimana', 'yang', "siapa", "mengapa", "apa", "ketika", "tolong", "bagaimana", "adalah",
                          "adalah", "akan", "bisa", "harus", "adalah", "adalah", "adalah", "lakukan ", " melakukan ",
                          " bisa "]
    elif language == "ko":
        model = Model(LibLocLang + 'korean-gsd-ud-2.5-191206.udpipe')
        question_words = ['어느', "누가 왜", "무엇", "언제", "제발", "어떻게", "는", "은", "의지", "할 수있다", "해야한다", "있었다", "있었다", "할",
                          "했다 ", "할 수있다"]
    elif language == "pt":
        model = Model(LibLocLang + 'portuguese-gsd-ud-2.5-191206.udpipe')
        question_words = ['onde', 'qual', "quem", "por que", "o que", "quando", "por favor", "como", "é", "vontade",
                          "poderia", "deveria", "era", "faz", "fez", "pode"]
    elif language == "vn":
        model = Model(LibLocLang + 'vietnamese-vtb-ud-2.5-191206.udpipe')
        question_words = ['đâu', 'cái nào', "Ai", "tại sao", "gì", "khi", "làm ơn", "làm thế nào", "là", "là", "sẽ",
                          "có thể", "nên", "đã", "đã", "làm", "đã", "có thể "]
    ########################
    if stopwords.has_lang(language):
        ########################
        stop_words = list(stopwords.stopwords(language))
        stop_words_list = []
        ########################
        for i in range(0, len(stop_words)):
            try:
                sentences = model.tokenize(stop_words[i])
                ########
                for s in sentences:
                    model.tag(s)  # inplace tagging
                    model.parse(s)  # inplace parsing
                ########
                datause = pd.read_csv(StringIO(model.write(sentences, "conllu")), sep="\t", header=None, skiprows=4)
                PosTagIntention = datause[datause.columns[2:4]].values.tolist()
                if (PosTagIntention[0][1] != "NOUN") and (PosTagIntention[0][1] != "VERB") and (
                        PosTagIntention[0][1] != "PRON"):
                    stop_words_list.append(PosTagIntention[0][0])
            except Exception:
                # skip stopwords that UDPipe fails to tokenize or parse
                continue
        stop_words = stop_words_list
    else:
        print(language + " has errors.")
        stop_words = []
Code Example #7
def get_extractors(extractor_type = 'count'):
    if extractor_type == 'count':
        transformer = CountVectorizer(
            preprocessor = clean_text,
            stop_words = stopwords('ny'),
            lowercase = True
        )
    elif extractor_type == 'tfidf':
        transformer = TfidfVectorizer(preprocessor = clean_text, stop_words = stopwords("ny"), ngram_range=(1,2))
    elif extractor_type == 'tfidf-transformer':
        transformer = TfidfTransformer(use_idf = False)
    else:
        transformer = CountVectorizer(
            preprocessor = clean_text,
            stop_words = stopwords('ny')
        )
    return transformer 
Code Example #8
def build_kw(path, lang='et'):
    outs = {}
    sw = stopwords(lang)
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip("\n")
            parsed = prepare(line, lang)
            if not parsed in sw:
                outs[parsed] = outs.get(parsed, []) + [line]
    return outs
Code Example #9
    def count_vectorizer(self):
        vectorizer = CountVectorizer(
            preprocessor = clean_text,
            stop_words = stopwords("ny"),
            ngram_range = (1, 2),
            min_df = 0.05,
        )
        train_features = vectorizer.fit_transform(self.train)
        test_features = vectorizer.transform(self.test)

        return train_features, test_features
Code Example #10
 def tfidf_transformer(self):
     vectorizer = CountVectorizer(
         preprocessor = clean_text,
         stop_words = stopwords("ny"),
         ngram_range = (1, 4),
         min_df = 0.05,
     )
     train_features = vectorizer.fit_transform(self.train)
     test_features = vectorizer.transform(self.test)
     transformer = get_extractors('tfidf-transformer')
     train_features = transformer.fit_transform(train_features)
     test_features = transformer.transform(test_features)
     return train_features, test_features
Code Example #11
 def get_lang_stopwords(self, lang = None):
     ## standardize the lang
     lang_stand = pycountry.languages.lookup(lang).alpha_2
     ## fetch stopwords
     if stopwords.has_lang(lang_stand):
         stop = stopwords.stopwords(lang_stand)
         if len(stop) > 1:
             ret = list(stop)
         else:
             ret = None
     else:
         ret = None
     return ret
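A short usage sketch (hypothetical call, not from the original class): pycountry resolves a language name or three-letter code to its ISO 639-1 alpha-2 code before the stopwords-iso lookup:

import pycountry
import stopwordsiso as stopwords

lang_stand = pycountry.languages.lookup("Portuguese").alpha_2  # -> "pt"
if stopwords.has_lang(lang_stand):
    print(len(stopwords.stopwords(lang_stand)))  # size of the bundled Portuguese list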
Code Example #12
def urlParze(url):
    DetectorFactory.seed = 0
    print('attempting to query ' + url)
    try:
        response = requests.get(url, timeout=2)
        response.raise_for_status()
    except Exception as err:
        print(f'Error for: {url} occurred')
        return (), 'zz'
    else:
        html = response.text
        text = text_from_html(html)
        if (len(text) < 100):
            return (), 'zz'
        lang = detect(text)
        #tokenizing text

        # THEN WE NEED TO MAKE DIFFERENT LOOPS DEPENDING ON THE LANGUAGE OF THE TEXT

        text = re.sub(r'[^\w\s]', '', text)

        # if language is english or indonesian
        if lang == 'en' or lang == 'id':
            text = nltk.word_tokenize(text)
            lowered = [x.lower() for x in text]
            if lang == 'en':
                lemmatizer = WordNetLemmatizer()
                output = [lemmatizer.lemmatize(x) for x in lowered]
            if lang == 'id':
                indLem = indLemm()
                output = [indLem.lemmatize(x) for x in lowered]
        elif lang == 'th':
            output = thaiword(text, keep_whitespace=False)
        elif lang == 'vi':
            output = list(chain.from_iterable(annotator.tokenize(text)))
        elif lang == 'ko':
            output = kParse.morphs(text)
        else:
            print("skipping because uknown language")
            return (), 'zz'
        stopL = set(stopwords(lang))
        out = [w for w in output if w not in stopL]

        #setting directory
        # return the stopword-filtered tokens rather than the unfiltered list
        return tuple(out), lang
Code Example #13
def build_kw_json(path, lang='et'):
    outs = {}
    sw = stopwords(lang)
    df = pd.read_csv(path, names=["kw"], dtype={})
    kw_df = df["kw"].astype(str).tolist()
    del df
    kws_ = set()
    for kw in tqdm(kw_df):
        #if lang == 'hr':
        #    kws_.add(kw)
        #else:
        for k in kw.split(';'):
            kws_.add(k.lower())
    return set(kws_)
    # NOTE: the lines below are unreachable because of the return above
    for line in list(kws_):
        parsed = prepare(line, lang)
        if not parsed in sw:
            outs[parsed] = outs.get(parsed, []) + [line]
    return outs
Code Example #14
                y='no of labels',
                kind='bar',
                legend=False,
                grid=True,
                figsize=(8, 8))
plt.title('Number of comments per category')
plt.ylabel('No of occurences')
plt.xlabel('Category')
#plt.show()
print()
#vocabulary = build_vocabulary()

vectorizer = TfidfVectorizer(sublinear_tf=True,
                             norm='l2',
                             ngram_range=(1, 2),
                             stop_words=stopwords('ny'))
train_features = vectorizer.fit_transform(train_texts).toarray()
#test_features = vectorizer.transform(test_texts).toarray()
reduced_vocabulary = []
print('Transformed features shape: ', train_features.shape)
label_ids = train_data['Label_Id']

K = 600
for label_id, label in sorted(encoded_labels.items()):
    train_features_chi2 = chi2(train_features, label_ids == label_id)
    indices = np.argsort(train_features_chi2[0])
    feature_names = np.array(vectorizer.get_feature_names())[indices]

    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
Code Example #15
def clean(
    texts,
    language="en",
    min_token_freq=2,
    min_token_len=3,
    min_tokens=0,
    max_token_index=-1,
    min_ngram_count=3,
    ignore_words=None,
    remove_names=False,
    sample_size=1,
    verbose=True,
):
    """
    Cleans text body to prepare it for analysis

    Parameters
    ----------
        texts : str or list
            The texts to be cleaned and tokenized

        language : str (default=en)
            The language of Wikipedia to download

        min_token_freq : int (default=2)
            The minimum allowable frequency of a word inside the corpus

        min_token_len : int (default=3)
            The smallest allowable length of a word

        min_tokens : int (default=0)
            The minimum allowable length of a tokenized text

        max_token_index : int (default=-1)
            The maximum allowable length of a tokenized text

        min_ngram_count : int (default=3)
            The minimum occurrences for an n-gram to be included

        ignore_words : str or list
            Strings that should be removed from the text body

        remove_names : bool (default=False)
            Whether to remove common names

        sample_size : float (default=1)
            The amount of data to be randomly sampled

        verbose : bool (default=True)
            Whether to show a tqdm progress bar for the query

    Returns
    -------
        text_corpus, selected_idxs : list, list
            The texts formatted for text analysis as well as the indexes for selected entries
    """
    language = language.lower()

    # Select abbreviation for the lemmatizer, if it's available
    if language in languages.lem_abbr_dict().keys():
        language = languages.lem_abbr_dict()[language]

    if type(texts) == str:
        texts = [texts]

    if type(ignore_words) == str:
        ignore_words = [ignore_words]
    elif ignore_words == None:
        ignore_words = []

    if stopwords(language) != set():  # the input language has stopwords
        stop_words = stopwords(language)

    # Stemming and normal stopwords are still full language names
    elif language in languages.stem_abbr_dict().keys():
        stop_words = stopwords(languages.stem_abbr_dict()[language])

    elif language in languages.sw_abbr_dict().keys():
        stop_words = stopwords(languages.sw_abbr_dict()[language])

    else:
        stop_words = []

    pbar = tqdm(desc="Cleaning steps complete",
                total=7,
                unit="step",
                disable=not verbose)
    # Remove spaces that are greater that one in length
    texts_no_large_spaces = []
    for t in texts:
        for i in range(
                25, 0, -1
        ):  # loop backwards to assure that smaller spaces aren't made
            large_space = str(i * " ")
            if large_space in t:
                t = t.replace(large_space, " ")

        texts_no_large_spaces.append(t)

    texts_no_websites = []
    for t in texts_no_large_spaces:
        websites = [word for word in t.split() if word[:4] == "http"]

        for w in websites:
            t = t.replace(w, "")

        texts_no_websites.append(t)

    # Remove the references section but maintain the categories if they exist
    # The reference are in the text, so this just removes the section and external links
    # References are maintained for references like awards
    texts_no_references = []
    for t in texts_no_websites:
        if "Category:" in t:
            t = re.sub(r"(?<= ==References==).+?(?= Category)",
                       "",
                       t,
                       flags=re.DOTALL)
        else:
            t = t.split("==References==")[0]

        texts_no_references.append(t)

    gc.collect()
    pbar.update()

    texts_no_random_punctuation = []
    # Prevent words from being combined when a user types word/word or word-word or word:word
    for t in texts_no_references:
        t = t.replace("/", " ")
        t = t.replace("-", " ")
        t = t.replace(":", " ")  # split categories so they can be n-grammed
        t = re.sub("==[^>]+==", "", t)  # remove headers
        t = re.sub("< !--[^>]+-- >", "", t)  # remove comments

        texts_no_random_punctuation.append(t)

    texts_no_punctuation = []
    for r in texts_no_random_punctuation:
        texts_no_punctuation.append(
            r.translate(str.maketrans("", "", string.punctuation + "–" + "’")))

    # We lower case after names are removed to allow for filtering out capitalized words
    tokenized_texts = [text.split() for text in texts_no_punctuation]

    gc.collect()
    pbar.update()

    # Add bigrams and trigrams
    bigrams = Phrases(
        sentences=tokenized_texts,
        min_count=min_ngram_count,
        threshold=5.0,
        common_terms=stop_words,
    )  # half the normal threshold
    trigrams = Phrases(
        sentences=bigrams[tokenized_texts],
        min_count=min_ngram_count,
        threshold=5.0,
        common_terms=stop_words,
    )

    tokens_with_ngrams = []
    for text in tqdm(
            tokenized_texts,
            total=len(tokenized_texts),
            desc="n-grams generated",
            unit="texts",
            disable=not verbose,
    ):
        for token in bigrams[text]:
            if token.count("_") == 1:
                # Token is a bigram, so add it to the tokens
                text.insert(0, token)

        for token in trigrams[bigrams[text]]:
            if token.count("_") == 2:
                # Token is a trigram, so add it to the tokens
                text.insert(0, token)

        tokens_with_ngrams.append(text)

    gc.collect()
    pbar.update()

    args = zip(
        tokens_with_ngrams,
        [remove_names] * len(tokens_with_ngrams),
        [ignore_words] * len(tokens_with_ngrams),
    )

    num_cores = os.cpu_count()
    if __name__ == "wikirec.data_utils":
        with Pool(processes=num_cores) as pool:
            tokens_lower = list(
                tqdm(
                    pool.imap(_lower_remove_unwanted, args),
                    total=len(tokens_with_ngrams),
                    desc="Unwanted words removed",
                    unit="texts",
                    disable=not verbose,
                ))

    gc.collect()
    pbar.update()

    # Try lemmatization, and if not available stem, and if not available nothing
    nlp = None
    try:
        nlp = spacy.load(language)
        base_tokens = _lemmatize(tokens=tokens_lower, nlp=nlp, verbose=verbose)

    except OSError:
        try:
            os.system("python -m spacy download {}".format(language))
            nlp = spacy.load(language)
            base_tokens = _lemmatize(tokens=tokens_lower,
                                     nlp=nlp,
                                     verbose=verbose)

        except:
            pass

    if nlp == None:
        # Lemmatization failed, so try stemming
        stemmer = None
        if language in SnowballStemmer.languages:
            stemmer = SnowballStemmer(language)

        # Correct if the abbreviations were put in
        elif language == "ar":
            stemmer = SnowballStemmer("arabic")

        elif language == "fi":
            stemmer = SnowballStemmer("finish")

        elif language == "hu":
            stemmer = SnowballStemmer("hungarian")

        elif language == "sv":
            stemmer = SnowballStemmer("swedish")

        if stemmer != None:
            # Stemming instead of lemmatization
            base_tokens = []
            for tokens in tqdm(
                    tokens_lower,
                    total=len(tokens_lower),
                    desc="Texts stemmed",
                    unit="texts",
                    disable=not verbose,
            ):
                stemmed_tokens = [stemmer.stem(t) for t in tokens]
                base_tokens.append(stemmed_tokens)

        else:
            # We cannot lemmatize or stem
            base_tokens = tokens_lower

    gc.collect()
    pbar.update()

    token_frequencies = defaultdict(int)
    for tokens in base_tokens:
        for t in list(set(tokens)):
            token_frequencies[t] += 1

    if min_token_len == None or min_token_len == False:
        min_token_len = 0
    if min_token_freq == None or min_token_freq == False:
        min_token_freq = 0

    assert (type(min_token_len) == int
            ), "The 'min_token_len' argument must be an integer if used"
    assert (type(min_token_freq) == int
            ), "The 'min_token_freq' argument must be an integer if used"

    min_len_freq_tokens = []
    for tokens in base_tokens:
        min_len_freq_tokens.append([
            t for t in tokens if len(t) >= min_token_len
            and token_frequencies[t] >= min_token_freq
        ])

    gc.collect()
    pbar.update()

    # Save original length for sampling
    original_len = len(min_len_freq_tokens)
    min_sized_texts = [[i, t] for i, t in enumerate(min_len_freq_tokens)
                       if len(t) > min_tokens]

    args = zip(min_sized_texts, [max_token_index] * len(min_sized_texts))
    if __name__ == "wikirec.data_utils":
        with Pool(processes=num_cores) as pool:
            text_corpus = list(
                tqdm(
                    pool.imap(_subset_and_combine_tokens, args),
                    total=len(min_sized_texts),
                    desc="Texts finalized",
                    unit="texts",
                    disable=not verbose,
                ))

    gc.collect()

    # Sample texts
    if len(text_corpus) > int(sample_size * original_len):
        idxs = [t[0] for t in text_corpus]
        selected_idxs = np.random.choice(a=idxs,
                                         size=int(sample_size * original_len),
                                         replace=False)

    else:
        selected_idxs = [t[0] for t in text_corpus]

    text_corpus = [t[1] for t in text_corpus if t[0] in selected_idxs]
    pbar.update()

    return text_corpus, selected_idxs
Code Example #16
def preprocess(tweet, ascii=True, ignore_rt_char=True, ignore_url=True,
               ignore_mention=True, ignore_hashtag=True,
               letter_only=True, remove_stopwords=True, min_tweet_len=3,
               content_words=True, lang='es'):
               
  key_words = ["coronavirus","corona","virus","coronaoutbreak","covid-19","covid19","2019-ncov","2019ncov","sars-cov-2","sarscov2","cov-19","cov19","covd19","covd19"] # keywords
  sword_en = set(stopwords.words('english'))
  sword_es = set(stopwords.words('spanish'))
  stop_words_iso = set(stopwordsiso.stopwords(["es", "en"]))
  reserved_words = ["rt", "fav", "vía", "nofollow", "twitter", "true", "href", "rel"]
  stop_words_es = set(get_stop_words('es'))
  stop_words_en = set(get_stop_words('en'))
  sword = set()
  sword.update(sword_en)
  sword.update(sword_es)
  sword.update(stop_words_en)
  sword.update(stop_words_iso)
  sword.update(stop_words_es)
  sword.update(reserved_words)
  sword.update(key_words)
  #
  gn_early_exit = ["nicaragua"] # lang_detect interprets gn

  if ascii:  # maybe remove lines with ANY non-ascii character
    for c in tweet:
      if not (0 < ord(c) < 127):
        return ''

  #tokens = tag(tweet.lower()) #tweet.lower().split()  # to lower, split
  doc = nlp(tweet.lower())
  res = []

  for token in doc:
    t = token
    token = t.text
    pos = t.pos_
    if lang != 'es' and token in gn_early_exit:
      return ''
    if remove_stopwords and lang == 'es' and token in sword:
      continue
    if ignore_rt_char and token == 'rt':
      continue
    if ignore_url and token.startswith('https:'):
      continue
    if ignore_mention and token.startswith('@'):
      continue
    if ignore_hashtag and token.startswith('#'):
      continue
    if letter_only:
      if not token.isalpha():
        continue
    elif token.isdigit():
      token = '<num>'
    #POS 
    if content_words and lang == 'es' and pos not in ["NOUN","PROPN","ADV","ADJ","VERB"]: # es
      continue
    if content_words and lang != 'es' and get_tag(token) not in ['n','v','adj','adv'] and pos not in ["NOUN","PROPN","ADV","ADJ","VERB"]: # gn
      continue
    #
      
    token = t.lemma_ if lang == 'es' else get_stem(token, True)
    res += token,

  #min_tweet_len
  if min_tweet_len and len(res) < min_tweet_len:
    return ''
  else: 
    return ' '.join(res)
Code Example #17
)
from rubrix._constants import MAX_KEYWORD_LENGTH
from stopwordsiso import stopwords

from .api import EsRecordDataFieldNames

SUPPORTED_LANGUAGES = ["es", "en", "fr", "de"]
DATASETS_RECORDS_INDEX_TEMPLATE = {
    "settings": {
        "number_of_shards": settings.es_records_index_shards,
        "number_of_replicas": settings.es_records_index_replicas,
        "analysis": {
            "analyzer": {
                "multilingual_stop_analyzer": {
                    "type": "stop",
                    "stopwords": [w for w in stopwords(SUPPORTED_LANGUAGES)],
                }
            }
        },
    },
    "index_patterns": [DATASETS_RECORDS_INDEX_NAME.format("*")],
    "mappings": {
        "properties": {
            "event_timestamp": {"type": "date"},
            EsRecordDataFieldNames.words: {
                "type": "text",
                "fielddata": True,
                "analyzer": "multilingual_stop_analyzer",
            },
            # TODO: Not here since it is task-dependent
            "tokens": {"type": "text"},
Code Example #18
def _get_stopwords(lang: str) -> AbstractSet[str]:
    result = set(stopwords(lang))

    if lang == "en":
        result.update(("'m", "'re", "'s", "'ve", "n't", "nt", "n’t", "’m",
                       "’re", "’s", "’ve"))
        result.difference_update((
            "case",
            "cases",
            "help",
            "home",
            "information",
            "man",
            "million",
            "new",
            "novel",
            "state",
            "states",
            "system",
            "today",
            "uk",
            "work",
            "world",
            "year",
            "years",
        ))

    elif lang == "de":
        result.update((
            "bleiben",
            "ca.",
            "echt",
            "eher",
            "eigentlich",
            "fast",
            "fest",
            "genau",
            "halt",
            "klar",
            "ne",
            "paar",
            "sogar",
            "trotz",
            "wahrscheinlich",
        ))
        result.difference_update((
            "ernst",
            "jahr",
            "jahre",
            "jahren",
            "mensch",
            "menschen",
            "neuen",
            "tag",
            "tage",
            "uhr",
            "wissen",
            "zeit",
        ))

    return result
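A brief usage sketch (hypothetical values) of the curated set returned above: the English contractions added with update() are treated as stopwords, while the domain terms removed with difference_update() are kept as content words:

en_stops = _get_stopwords("en")
print("n't" in en_stops)    # True: added to the base list
print("cases" in en_stops)  # False: explicitly kept as a content word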
Code Example #19
import string
import stopwordsiso as stopwords


app = Flask(__name__)

# Load the saved model artifacts
categor = pd.read_csv('category.csv')
nb = pickle.load(open("random_forest_classi.pkl","rb"))
cv = pickle.load(open("cv_content.pkl","rb"))
cv_head = pickle.load(open("cv_head.pkl","rb"))
col_transform = pickle.load(open("one_hot.pkl","rb"))


# Stop words
stop_words = stopwords.stopwords("bn")



#  #####################  Function section  ########################

# NLP Preprocess function

# Apply a first round of text cleaning techniques
def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
Code Example #20
for label in all_labels:
    mask = train_data['Label'] == label
    label_count.append((label,len(train_data[mask])))

le = LabelEncoding(all_labels)
train_data, encoded_labels = le.encode(train_data)
data_stats = pd.DataFrame(label_count, columns=['category','no of labels'])
data_stats.plot(x='category',y='no of labels', kind='bar', legend=False, grid=True, figsize=(8, 8))
plt.title('Number of comments per category')
plt.ylabel('No of occurences')
plt.xlabel('Category')
#plt.show()
print()
#vocabulary = build_vocabulary()

vectorizer = TfidfVectorizer(sublinear_tf = True, norm = 'l2', ngram_range = (1,2), stop_words = stopwords('ny'))
train_features = vectorizer.fit_transform(train_texts).toarray()
#test_features = vectorizer.transform(test_texts).toarray()
reduced_vocabulary = []
print('Transformed features shape: ',train_features.shape)
label_ids = train_data['Label_Id']

K = 900
for label_id, label in sorted(encoded_labels.items()):
    train_features_chi2 = chi2(train_features, label_ids == label_id)
    indices = np.argsort(train_features_chi2[0])
    feature_names = np.array(vectorizer.get_feature_names())[indices]

    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
Code Example #21
from cltk.stop.classical_hindi.stops import STOPS_LIST

# Cell
tok = WordTokenizer(language='multilingual')
## libraries that can be used
hi_stopwords = []
with open('../Data/Data/hindi_stopwords.txt', 'r') as fp:
    for w in fp.readlines():
        hi_stopwords.append(str(w[:-1]))
puncts = [
    ">", "+", ":", ";", "*", "’", "●", "•", "-", ".", "''", "``", "'", "|",
    "​", "!", ",", "@", "?", "\u200d", "#", "(", ")", "|", "%", "।", "=", "``",
    "&", "[", "]", "/", "'"
]
stop_for_this = hi_stopwords + list(
    stopwords.stopwords(["en", "hi", "ta", "te", "bn"])) + [
        "आएगा", "गए", "गई", "करे", "नही", "हम", "वो", "follow", "दे", "₹",
        "हर", "••••", "▀▄▀", "नही", "अब", "व्हाट्सएप", "॥", "–", "ov", "डॉ",
        "ॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐॐ", "क्या", "जी", "वो", "╬═╬", "_",
        "backhand_index_pointing_down", "backhand_index_pointing_right",
        "link", "subscribe", "backhand_index_pointing_down_light_skin_tone",
        "backhand_index_pointing_up", "Whatsapp", "Follow", "Tweet",
        "सब्सक्राइब", "Link", "\'\'", "``", "________________________________",
        "_________________________________________"
    ]


# Cell
def preprocess_sent(
    sent,
    params={
Code Example #22
def langmodelload(language):
    ########################
    global stop_words
    global question_words
    global embeddings
    global model
    global lang_dict
    ########################
    LibLocLang = "./udpipe-ud/"
    ########################
    if language == "en":
        model = Model(LibLocLang + 'english-ewt-ud-2.5-191206.udpipe')
    elif language == "ar":
        model = Model(LibLocLang + 'arabic-padt-ud-2.5-191206.udpipe')
    elif language == "zh":
        model = Model(LibLocLang + 'chinese-gsdsimp-ud-2.5-191206.udpipe')
    elif language == "id":
        model = Model(LibLocLang + 'indonesian-gsd-ud-2.5-191206.udpipe')
    elif language == "ko":
        model = Model(LibLocLang + 'korean-gsd-ud-2.5-191206.udpipe')
    elif language == "pt":
        model = Model(LibLocLang + 'portuguese-gsd-ud-2.5-191206.udpipe')
    elif language == "vi":
        model = Model(LibLocLang + 'vietnamese-vtb-ud-2.5-191206.udpipe')
    elif language == "hi":
        model = Model(LibLocLang + 'hindi-hdtb-ud-2.5-191206.udpipe')
    elif language == "jp":
        model = Model(LibLocLang + 'japanese-gsd-ud-2.5-191206.udpipe')
    elif language == 'es':
        model = Model(LibLocLang + 'spanish-gsd-ud-2.5-191206.udpipe')
    ########################
    base_question_words = [
        'where', 'which', "who", "why", "what", "when", "please", "how", "is",
        "are", "will", "could", "should", "was", "were", "do", "did", "can"
    ]
    question_words = []
    for i in range(0, len(base_question_words)):
        question_words.append(
            Text(base_question_words[i]).transliterate(language))
    ########################
    if stopwords.has_lang(
            language
    ) and language != "hi" and language != "ar" and language != "zh" and language != "vi" and language != "ko" and language != "jp" and language != "id" and language != "ms":
        ########################
        stop_words = list(stopwords.stopwords(language))
        stop_words_list = []
        ########################
        for i in range(0, len(stop_words)):
            try:
                text = Text(stop_words[i], hint_language_code=language)
                ########################
                if (text.pos_tags[0][1] != "NOUN") and (
                        text.pos_tags[0][1] != "VERB") and (text.pos_tags[0][1]
                                                            != "PRON"):
                    stop_words_list.append(text.pos_tags[0][0])
            except Exception as e:
                print(e)
        stop_words = stop_words_list
    else:
        print(language + " has errors.")
        stop_words = []
    ########################
    ########################

    embeddings = Embedding.load("./polyglot_data/embeddings2/" + language +
                                "/embeddings_pkl.tar.bz2")
    lang_dict[language] = {
        'model': model,
        'embeddings': embeddings,
        'stop_words': stop_words
    }
Code Example #23
import lemmy
import lemmy.pipe
import nltk
from polyglot.text import Text
import pycld2 as cld2
import pandas as pd
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import stopwordsiso as stopwords

lemmatizer = lemmy.load("da")

# Stop words + custom additions
stopwordlist = stopwords.stopwords("da")
stopwordlist.update([
    'du', 'og', 'til', 'kan', 'vores', 'brug', 'dine', 'første', 'ved', 'find',
    'dit', 'mere', 'blevet', 'tager', 'søg', 'http', 'dk', 'søg', 'læs'
])

# Open file and lower case letters
with open("pfa.txt", "r") as file:
    text = file.read().lower()

# Remove numbers from text
text = ''.join([i for i in text if not i.isdigit()])

# Remove all special characters
text = re.sub(r'[-()\"#_/@;:<>{}`+=~|.!?,]', ' ', text)
Code Example #24
 def __init__(self, config):
     self.all_stopwords = stopwords(["en", "zh"])
     use_cuda_flag = config.get("use_cuda", False)
     self.model = LAC(mode='seg', use_cuda=use_cuda_flag)
Code Example #25
        list_of_terms = []
        with open(f"{sourcedir}/{domain}_{lang}_terms_nes.ann",
                  "r", encoding="utf-8") as f:
            for line in f.readlines():
                s = str(line).replace("OOD_Term", "").replace("Common_Term", "").replace("Specific_Term", "").replace(
                    "Named_Entity", "").strip("\n").strip("\t")
                list_of_terms.append(s)

        kwp.add_keywords_from_list(list_of_terms)

        # Remove unwanted terms from list (single letters, prepositions, stop-words etc.)
        abc_list = list(string.ascii_uppercase + string.ascii_lowercase)
        kwp.remove_keywords_from_list(abc_list)
        kwp.remove_keywords_from_list(word_boundary_list)

        for i in stopwords(f"{lang}"):
            kwp.remove_keyword(i)
            kwp.remove_keyword(i.capitalize())

        # Extract the terms
        with open(f"{outdir}/{lang}{suffix1}/{domain}{suffix2}full_tok.txt", "r", encoding="utf-8") as f:
            sentences = f.readlines()
#        print(sentences[-10:])
        results =[]
        for line in tqdm(sentences):
            s = kwp.extract_keywords(line.rstrip())
            results.append(s)
        print(results[-10:])

        # Remove previously added terms from keyword processor
        for i in list_of_terms:
Code Example #26
    def __init__(self, config: dict):
        self.languages = json.loads(config['general']['languages'])
        self.chunksize = int(config['general']['chunksize'])
        self.rebuild_entire_database = config['general'][
            'rebuild_entire_database'].lower() == 'true'
        self.process_new_files_only = config['general'][
            'process_new_files_only'].lower() == 'true'

        self.data_dir = self.create_dir(ROOT_DIR, config['dir']['data_dir'])
        self.progress_dir = self.create_dir(self.data_dir,
                                            config['dir']['progress_dir'])
        self.spider_specific_dir = self.create_dir(
            ROOT_DIR, config['dir']['spider_specific_dir'])

        self.spiders_dir = self.create_dir(self.data_dir,
                                           config['dir']['spiders_subdir'])
        self.spacy_subdir = self.create_dir(self.data_dir,
                                            config['dir']['spacy_subdir'])
        self.datasets_subdir = self.create_dir(
            self.data_dir, config['dir']['datasets_subdir'])
        self.tmp_subdir = self.create_dir(self.data_dir,
                                          config['dir']['tmp_subdir'])

        self.corpora_subdir = self.create_dir(self.data_dir,
                                              config['dir']['corpora_subdir'])
        self.slc_subdir = self.create_dir(self.corpora_subdir,
                                          config['dir']['slc_subdir'])
        self.slc_spacy_subdir = self.create_dir(self.slc_subdir,
                                                config['dir']['spacy_subdir'])
        self.jureko_subdir = self.create_dir(self.corpora_subdir,
                                             config['dir']['jureko_subdir'])
        self.jureko_spacy_subdir = self.create_dir(
            self.jureko_subdir, config['dir']['spacy_subdir'])
        self.wikipedia_subdir = self.create_dir(
            self.corpora_subdir, config['dir']['wikipedia_subdir'])
        self.wikipedia_spacy_subdir = self.create_dir(
            self.wikipedia_subdir, config['dir']['spacy_subdir'])
        self.spider_specific_dir = self.create_dir(
            ROOT_DIR, config['dir']['spider_specific_dir'])
        self.output_dir = self.create_dir(self.data_dir,
                                          config['dir']['output_subdir'])

        self.legal_info_dir = self.create_dir(ROOT_DIR,
                                              config['dir']['legal_info_dir'])

        self.ip = config['postgres']['ip']
        self.port = config['postgres']['port']
        self.user = config['postgres']['user']
        self.password = config['postgres']['password']
        self.db_scrc = config['postgres']['db_scrc']
        self.db_jureko = config['postgres']['db_jureko']
        self.db_slc = config['postgres']['db_slc']
        self.db_wikipedia = config['postgres']['db_wikipedia']

        self.indexes = json.loads(config['postgres']['indexes'])

        self.num_cpus = multiprocessing.cpu_count()

        self.stopwords = stopwords(self.languages)
        # this should be filtered out by PUNCT pos tag already, but sometimes they are misclassified
        self.stopwords |= {' ', '.', '!', '?'}

        self.counter_types = ['counter_lemma', 'counter_pos', 'counter_tag']
Code Example #27
 def tfidf_vectorizer(self):
     vectorizer = TfidfVectorizer(
         preprocessor = clean_text,
         stop_words = stopwords("ny"),
         ngram_range = (1, 2),
         sublinear_tf = True,
         min_df = 0.05,
         norm = 'l2',
     )
     train_features = vectorizer.fit_transform(self.train)
     test_features = vectorizer.transform(self.test)
     return train_features, test_features
Code Example #28
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import pyLDAvis.sklearn
import matplotlib.pyplot as plt

#from nltk.corpus import stopwords
#stop_words = stopwords.words('danish')

import stopwordsiso as stopwords
stopwords.langs()  # return a set of all the supported languages
stopwords.has_lang("da")  # check if there is a stopwords for the language
stopwords.stopwords("da")  # danish stopwords

import pandas as pd
import numpy as np
import scipy as sp
import sklearn
import sys
#from nltk.corpus import stopwords;
import nltk
from gensim.models import ldamodel
import gensim.corpora
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle
from gensim.models import CoherenceModel
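As a minimal illustration (the toy corpus and variable names below are assumptions, not part of the original script), the Danish stopword set can be passed straight to scikit-learn's vectorizers, which only need membership tests on stop_words:

from sklearn.feature_extraction.text import CountVectorizer
import stopwordsiso as stopwords

docs = ["dette er en lille test", "endnu en test af stopord"]  # toy Danish corpus
vectorizer = CountVectorizer(stop_words=stopwords.stopwords("da"))
doc_term = vectorizer.fit_transform(docs)
print(vectorizer.get_feature_names_out())  # function words such as "er", "en", "af" are filtered out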
Code Example #29
File: predict.py  Project: slowwavesleep/AntidictAPI
import pickle
import gensim
from razdel import tokenize
import regex
import stopwordsiso
from typing import List, Union, Dict, Any, Set

stops = set("""чей свой из-за вполне вообще вроде сюда аж той
россия россии россию россией путин путина путину путиным путине
даю даешь дает даем даете дают""".split())
stops = stops | stopwordsiso.stopwords("ru")

with open("models/classifier.pkl", "rb") as file:
    loanword_clf = pickle.load(file)
with open("models/cb_classifier.pkl", "rb") as file:
    obscene_clf = pickle.load(file)
with open("models/expressive_classifier.pkl", "rb") as file:
    expressive_clf = pickle.load(file)

model = gensim.models.KeyedVectors.load(
    "models/fasttext/araneum_none_fasttextcbow_300_5_2018.model")


def statistics(analysis: List[dict]) -> dict:
    total = len(analysis)
    loanword = len([t for t in analysis if t["loanword"]])
    obscene = len([t for t in analysis if t["obscene"]])
    expressive = len(
        [t for t in analysis if (t["obscene"] or t["expressive"])])
    stats = {
        "loanword_ratio": loanword,
Code Example #30
File: tool_01.py  Project: canislatranscoxus/redis
 def get_words(self):
     result = stopwords.stopwords(self.language)
     return result