Example #1
def stem_words(iterable, language='english'):
    """Stem every word in iterable.

    Uses PyStemmer, which is based on the Porter stemming algorithm -
    an algorithm for suffix stripping.

    https://tartarus.org/martin/PorterStemmer/def.txt

    :rtype: list.
    """
    try:
        stemmer = Stemmer(language)
    except KeyError:
        stemmer = Stemmer('english')
    return stemmer.stemWords(iterable)
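A minimal usage sketch for Example #1, assuming PyStemmer is installed (pip install PyStemmer) and that Stemmer refers to Stemmer.Stemmer:

from Stemmer import Stemmer  # PyStemmer exposes the Stemmer class inside the Stemmer module

print(stem_words(['running', 'easily', 'cats']))
# With the English Snowball stemmer this yields ['run', 'easili', 'cat'].
print(stem_words(['running'], language='no-such-language'))
# An unknown language name raises KeyError inside PyStemmer, so the function
# falls back to 'english' via the except clause above.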
Example #2
def stemmer(tokens):
    # ps = PorterStemmer()
    # tokens = [ps.stem(w) for w in tokens]

    ps = Stemmer('porter')
    tokens = [ps.stemWord(w) for w in tokens]
    return tokens
def textHandler(text):
    #print(text)
    #stop_word = {}
    #tokenizing
    text = text.encode('ascii', errors='ignore').decode()
    text = re.sub(r'[^A-Za-z0-9]+', r' ', text)
    #tokens = nltk.word_tokenize(text)#tokenizing
    #stop word removal
    #uwords = [word for word in tokens if word not in stop_word.keys()]#stop word removal
    #print('remove',uwords)
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filter_sentence = [w for w in word_tokens if w not in stop_words]
    # filter_sentence = []
    # for w in word_tokens:
    #     if w not in stop_words:
    #         filter_sentence.append(w)

    stemmer = Stemmer('porter')
    stem_text = []
    for word in filter_sentence:
        stem_text.append(stemmer.stemWord(word))
    #print(filter_sentence)
    # print('before',len(filter_sentence))
    # print('after',len(stemming(stem_text)))
    return stem_text
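A quick note on the snippet above: unlike nonField_query in Example #30, textHandler never lower-cases the text, so capitalized stop words such as 'The' survive the filter. A hedged illustration (requires the NLTK stopwords/punkt data and PyStemmer):

print(textHandler('The cats are running'))
# Roughly ['The', 'cat', 'run'] - lowercase 'are' is removed, but the capitalized 'The' is not.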
Example #4
 def __init__(self):
     self.lexicon = {}            #lexicon for assisting in search
     self.titles = {}                #document titles
     self.stop_words = {}
     self.stemmer = Stemmer("english")  # for stemming of words
     self.totalDocs = 127467                     # total count of all pages found in our documents (please update this count according to your dataset)
     self.load()
Example #5
def stem(datalist):  #Stemming
    stemmer = Stemmer("english")
    tmp = []
    for x in datalist:
        y = stemmer.stemWord(x)
        tmp.append(y)
    return tmp
Example #6
def text_cleaner(text):
    text = text.lower()  # convert to lowercase

    text = re.sub(r'https?://[\S]+', ' url ', text)  # replace internet links
    text = re.sub(r'[\w\./]+\.[a-z]+', ' url ', text)

    text = re.sub(r'\d+[-/\.]\d+[-/\.]\d+', ' date ',
                  text)  # replace dates and times
    text = re.sub(r'\d+ ?гг?', ' date ', text)
    text = re.sub(r'\d+:\d+(:\d+)?', ' time ', text)

    # text = re.sub( r'@\w+', ' tname ', text ) # replace twitter user names
    # text = re.sub( r'#\w+', ' htag ', text ) # replace hashtags

    text = re.sub(r'<[^>]*>', ' ', text)  # remove html tags
    text = re.sub(r'[\W]+', ' ', text)  # remove leftover non-word characters

    stemmer = Stemmer('russian')
    text = ' '.join(stemmer.stemWords(text.split()))

    stw = [
        'в', 'по', 'на', 'из', 'и', 'или', 'не', 'но', 'за', 'над', 'под',
        'то', 'a', 'at', 'on', 'of', 'and', 'or', 'in', 'for'
    ]
    remove = r'\b(' + '|'.join(stw) + r')\b'  # both halves must be raw strings, otherwise \b is a backspace
    text = re.sub(remove, ' ', text)

    text = re.sub(r'\b\w\b', ' ', text)  # remove standalone single letters

    text = re.sub(r'\b\d+\b', ' digit ', text)  # replace numbers

    return text
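A hedged usage sketch for text_cleaner above (assumes PyStemmer ships the Russian Snowball stemmer); the sample string is made up for illustration:

sample = 'Смотрите https://example.com 01.02.2020 в 10:30, цена 100 руб.'
print(text_cleaner(sample))
# URLs, dates, times and standalone numbers come out as the placeholder tokens
# 'url', 'date', 'time' and 'digit'; the remaining words are lower-cased,
# stemmed and stripped of the short stop words listed in stw.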
Example #7
def text_cleaner(text: str):
    text = text.lower()
    stemmer = Stemmer("russian")  # Выбор языка на котором будут входные данные
    text = " ".join(stemmer.stemWords(text.split()))
    text = re.sub(r"\b\d+\b", "digit",
                  text)  # По идее заменяет цифры ! (на что пока не понял)
    return text
Example #8
def tokenise(value, identifier, category, content_stop):
    token_list = []
    final_list = []
    value = re.sub(exclude1, " ", value)
    value = re.sub(exclude2, " ", value)
    value = re.sub(r'[^a-zA-Z]', " ", value)
    value = value.lower()
    if category == 'e':
        value = re.sub(r'(http|www|com)', " ", value)
    if category == 'c':
        value = re.sub(r'category', " ", value)
    token_list = value.split()
    for w in token_list:
        if w not in content_stop.keys():
            final_list.append(w)
#    stemmer = PorterStemmer()
    stemmer = Stemmer("english")
    final_list = [stemmer.stemWord(key) for key in final_list]
    #    final_list = [stemmer.stem(plural,0, len(plural)-1) for plural in final_list]
    if final_list:
        # call the next function here.
        return final_list
Example #9
 def _prepare_text(self, text):
     """Extracts and stems the words from some given text.
     """
     words = re.findall('[a-z0-9\']+', text.lower())
     words = [word for word in words if word not in STOP_WORDS]
     stemmer = Stemmer('english')
     stemmed_words = stemmer.stemWords(words)
     return stemmed_words
def get_stemmer(language=None):
    """
    Return stemmer for given language.

    The default language is english.
    """

    return Stemmer(language or 'english')
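A small usage sketch for get_stemmer, assuming PyStemmer; 'porter' selects the classic Porter rules, while any Snowball language name works the same way:

print(get_stemmer().stemWord('connections'))          # 'connect' (English Snowball)
print(get_stemmer('porter').stemWord('connections'))  # 'connect' (classic Porter)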
Example #11
def main(inputFile):
  print("Start making emoticon map")
  map_emoticon = generateEmotMap('emoticon_id.txt')
  print("Finished")

  print("Start making senti map")
  map_senti = generateSentiMap(['boosterwords_id.txt', 'idioms_id.txt', 'negatingword.txt', 'sentiwords_id.txt'])
  print("Finished")
  
  print("Start making abbreviation dictionary for bahasa")
  corrector = Corrector('singkatan.dic')
  print("Finished")

  print("Start making stopword dictionary for bahasa")
  cutter = Cutter('stopword.txt')
  print("Finished")

  print("Start making stemmer for bahasa")
  stemmer = Stemmer()
  print("Finished")
  
  output_file = sys.argv[2] + '.txt'
  file_read = open(str(inputFile), "r", encoding='utf-8')
  file_write = open(output_file, "w", encoding='utf-8')
  start = timeit.default_timer()
  review_number = 0
  for line in file_read.readlines():
    review_number += 1
    user_rating = line.split('<>')[0]
    sentence_number = 0
    header_string = 'REVIEW-' + str(review_number) + ' ' + str(user_rating)
    body_string = ''
    # file_write.write('REVIEW-' + str(review_number) + ' [rating] ' + str(user_rating) + '\n')
    review = line.split('<>')[1]
    review = erase_question_sentence(review) # Erase question sentence
    for i, sentence in enumerate(re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', review)):
      print("Processing sentence " + str(i+1) + ' from review ' + str(review_number))
      sentence = sentence.lower()
      sentence = corrector.correct(sentence, map_emoticon, map_senti).strip()
      sentence = cutter.cut(sentence, map_emoticon, map_senti).strip()
      sentence = stemmer.stem(sentence, map_emoticon, map_senti).strip()
      if (sentence != ''):
        # file_write.write(sentence + "\n")
        body_string = body_string + sentence + '\n'
        sentence_number += 1
    header_string = header_string + ' ' + str(sentence_number) + '\n'
    output_string = header_string + body_string
    file_write.write(output_string)
  stop = timeit.default_timer()
  print("Running time: " + str(stop - start))
  print("Finished.\nOutput file: " + output_file)
  file_read.close()
  file_write.close()
  duration = 1000  # millisecond
  freq = 440  # Hz
  winsound.Beep(freq, duration)
    def train(self):
        '''
        
        Trains a bow vectorizer

        '''

        processors = dict()
        data = get_speech_text(folder=self.folder)
        text = chain.from_iterable(data.values())

        # stemming
        if 'stemming' in self.steps:
            print "Stemming"
            processors['stemmer'] = Stemmer('german')
            text = map(
                lambda y: ' '.join(processors['stemmer'].stemWords(y.split(' ')
                                                                   )), text)

        # the count vectorizer of scikit learn
        if 'hashing' in self.steps:
            print 'Hashing Bag of words vectorizer'
            count_vect = HashingVectorizer(ngram_range=(1, 5),
                                           decode_error='ignore').fit(text)
        elif 'trigrams' in self.steps:
            print "Trigram Bag-of-Words"
            count_vect = CountVectorizer(ngram_range=(1, 3),
                                         min_df=2).fit(text)
        elif 'bigrams' in self.steps:
            print "Bigram Bag-of-Words"
            count_vect = CountVectorizer(ngram_range=(1, 2),
                                         min_df=2).fit(text)
        else:
            print "Unigram Bag-of-Words"
            count_vect = CountVectorizer(min_df=2).fit(text)

        processors['count_vectorizer'] = count_vect
        text = count_vect.transform(chain.from_iterable(data.values()))

        if 'tfidf' in self.steps:
            print "Tf-idf normalization"
            processors['tf_transformer'] = TfidfTransformer(
                use_idf=True).fit(text)
            text = processors['tf_transformer'].transform(text)

        # dumping this vectorizer to pickle
        fn = self.folder + '/vectorizer_%s.pickle' % '_'.join(
            sorted(self.steps))
        cPickle.dump(processors, open(fn, 'wb'), -1)

        self.processors = processors
        fn = self.folder + '/bag_of_words_%s.pickle' % '_'.join(
            sorted(self.steps))
        for party in data.keys():
            data[party] = self.transform(data[party])
        cPickle.dump(data, open(fn, 'wb'), -1)
Example #13
def processQueries(queries):
    queryList = []
    stemmer = Stemmer('english')  # build the stemmer once, outside the loop
    for query in queries:
        filteredQuery = tokenize.filterToken(query, tokenize.getStopWords())
        if filteredQuery:  # a truthy value can never be None, so the extra check was redundant
            queryStem = stemmer.stemWord(filteredQuery.lower())
            queryList.append(queryStem)

    return queryList
Example #14
def process_text(s):
    s = re.sub('<[^>]+>', '', s)
    s = re.sub('&.*?;', '', s)
    words = simple_preprocess(s, deacc=True, max_len=99)
    words = [word for word in words if word not in stoplist]
    stemmer = Stemmer('english')
    words = stemmer.stemWords(words)
    #print words
    #print stoplist
    #raw_input()
    return words
Example #15
def get_stemmer(lang, allow_dummy=True):
    global _stemmers
    if lang:
        lang = locale_to_lang(lang)
    if lang not in known_languages:
        if not allow_dummy:
            return None
        _stemmers[lang] = DummyStemmer()
    elif lang not in _stemmers:
        _stemmers[lang] = Stemmer(lang)
    return _stemmers[lang]
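A hedged sketch of how Example #15's module-level cache behaves, assuming locale_to_lang passes plain language names through and 'english' is among known_languages (DummyStemmer, locale_to_lang and known_languages are names from the snippet's own module):

_stemmers = {}  # the module-level cache that get_stemmer mutates

s1 = get_stemmer('english')
s2 = get_stemmer('english')
assert s1 is s2  # repeated calls reuse the same cached Stemmer instance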
Example #16
File: single.py  Project: PaulHuygen/xtas
def stem_snowball(doc, language):
    """Stem words in doc using the Snowball stemmer.

    Set the parameter ``language`` to a language code such as "de", "en", "nl", or
    the special string "porter" to get Porter's classic stemming algorithm for
    English.

    See also
    --------
    morphy: smarter approach to stemming (lemmatization), but only for English.
    """
    from Stemmer import Stemmer
    return Stemmer(language).stemWords(_tokenize_if_needed(doc))
Example #17
    def parse_html(html):
        words = dehtml(html)

        s = Stemmer("danish")

        result = []
        for w in words.split():
            word = w.lower()
            if word in stop_words or len(word) < 2 or word.count('\\'):
                continue

            result.append(s.stemWord(word))
        return result
Example #18
File: nlp.py  Project: artemvang/tldry
    def __init__(self, language, min_sent_len):
        if language not in AVAILABLE_LANGUAGES:
            err = (f"Language '{language}' is not available, "
                   f"choose from [{', '.join(AVAILABLE_LANGUAGES)}]")
            raise ValueError(err)

        self.min_sent_len = min_sent_len

        self.stem = Stemmer(language).stemWord

        stopwords = importlib.import_module(f'tldry.stopwords.{language}')
        self.stopwords = frozenset(self.stem(w) for w in stopwords.stopwords)

        self.raw_sentences = []
Example #19
def getTerm(term):
    term_ids = {}
    # Stem the query term once instead of rebuilding the stemmer for every line.
    stemmer = Stemmer('english')
    #stemmer.maxCacheSize = 1
    termStem = stemmer.stemWord(term.lower())

    # 'with' guarantees the file is closed even on the early return;
    # the deprecated 'rU' mode is replaced by plain text mode.
    with open(TERMIDSFILE, 'r') as term_ids_file:
        for line in term_ids_file:
            pieces = line.strip().split('\t')
            if termStem == pieces[1]:
                term_ids[pieces[1]] = int(pieces[0])
                return term_ids

    return term_ids
Example #20
File: single.py  Project: toabey/xtas
def stem_snowball(doc, language):
    """Stem words in doc using the Snowball stemmer.

    Set the parameter ``language`` to a language code such as "de", "en", "nl", or
    the special string "porter" to get Porter's classic stemming algorithm for
    English.

    See also
    --------
    morphy: smarter approach to stemming (lemmatization), but only for English.
    """
    from Stemmer import Stemmer
    # Build the Stemmer before fetching to force an exception for invalid
    # languages.
    stem = Stemmer(language).stemWords
    return pipe(doc, fetch, _tokenize_if_needed, stem)
Example #21
def simple_tokenizer(text: str, stemmer: Stemmer = None) -> List[List[Tuple]]:
    if stemmer is None:
        stemmer = Stemmer('english')
    text = unicodedata.normalize(UNICODE_NORMALIZATION, text)
    text = normalize_quotation_marks(text)
    paragraphs = split_into_paragraphs(text)

    stopwords = get_stopwords()
    common_abbr = get_common_abbr()
    sentences = split_into_sentences(paragraphs, common_abbr)

    tagged_sentences = tag_sentences(sentences, stopwords, common_abbr)

    tagged_and_stemmed = apply_snowball_stemmer(tagged_sentences, stemmer)

    return tagged_and_stemmed
Example #22
    def __init__(self, rules: List[ExtendedRule], tokenizer_fn=tokenize):
        self.rules = rules
        self.intents = []
        self.resolvers = [
            StemmerResolver(Stemmer('russian'))
        ]
        self.j2_env = NativeEnvironment()
        self.tokenizer_fn = tokenizer_fn

        for r in rules:
            if r.production.startswith('intent'):
                self.intents.append(r.production)

        self.intents = tuple(set(self.intents))

        self._parsing_tables = {}
    def __init__(self, inputPath, outputPath):

        self.inputPath = inputPath
        self.outputPath = outputPath

        logging.setLoggerClass(ColoredLogger)
        self.logger = logging.getLogger('ArnetMinerDataImporter')

        # Get the stop words set & stemmer for text analysis
        self.stopWords = None
        with open(
                os.path.join(os.getcwd(), 'src', 'importer',
                             'stopWords.json')) as stopWordsFile:
            self.stopWords = set(json.load(stopWordsFile))
        self.stemmer = Stemmer('english')

        super(ArnetMinerDataImporter, self).__init__()
Example #24
def summarize(text_file: str) -> Dict:
    en_stemmer = Stemmer('english')
    with open(text_file, 'r', encoding='utf-8') as file:
        sentences = simple_tokenizer(file.read(), en_stemmer)

    scored_terms = score_terms(sentences)
    scored_sentences = score_sentences(scored_terms, sentences)

    top_keywords = get_top_keywords(
        scored_terms)  # TODO those are underlying reprs!
    reduced_text = reduce_sentences(scored_sentences)

    return {
        'top keywords': top_keywords,
        'original text': sentences,
        'reduced text': reduced_text,
        'reduced by': (1 - (len(reduced_text) / float(len(sentences))))
    }
    def __init__(self, inputFolderPath, outputPath):

        self.inputFolderPath = inputFolderPath
        self.outputPath = outputPath

        logging.setLoggerClass(ColoredLogger)
        self.logger = logging.getLogger('SerializedDBLPDataImporter')

        # Get the stop words set & stemmer for text analysis
        self.stopWords = None
        with open(os.path.join(os.getcwd(), 'src', 'importer', 'stopWords.json')) as stopWordsFile:
            self.stopWords = set(json.load(stopWordsFile))
        self.stemmer = Stemmer('english')

        # Regex for stripping non-visible characters
        controlChars = ''.join(map(unichr, range(0,32) + range(127,160)))
        self.controlCharactersRegex = re.compile('[%s]' % re.escape(controlChars))

        super(SerializedDBLPDataImporter, self).__init__()
def stemming(tokens):
    """
        Input = Tokens after tokenisation and removing stop words
        Function = Use stemmer to identify the root word
    """

    # Build the stemmer once, and avoid shadowing the built-in 'str'.
    stemmer = Stemmer('english')
    newlist = []

    for s in tokens:
        if s not in stemmed_dict:
            stemmed_dict[s] = stemmer.stemWord(s)

        #if stemmed_dict[s] not in newlist:
        newlist.append(stemmed_dict[s])

    return newlist
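A side note on the hand-rolled stemmed_dict cache above: PyStemmer itself caches recently stemmed words, and the cache size is tunable via the maxCacheSize attribute (the commented-out line in Example #19 hints at the same attribute). A minimal sketch, assuming PyStemmer:

stemmer = Stemmer('english')
stemmer.maxCacheSize = 50000        # enlarge the internal word -> stem cache
print(stemmer.stemWord('running'))  # 'run'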
Example #27
    def __init__(self, words_file=default_words, verbs_file=default_verbs, joined_verb_parts=True):
        self.verbs = {}
        self.stemmer = Stemmer()

        tokenizer = WordTokenizer(words_file=default_words, verbs_file=verbs_file)
        self.words = tokenizer.words

        if verbs_file:
            self.verbs['است'] = '#است'
            for verb in tokenizer.verbs:
                for tense in self.conjugations(verb):
                    self.verbs[tense] = verb
            if joined_verb_parts:
                for verb in tokenizer.verbs:
                    bon = verb.split('#')[0]
                    for after_verb in tokenizer.after_verbs:
                        self.verbs[bon + 'ه_' + after_verb] = verb
                        self.verbs['ن' + bon + 'ه_' + after_verb] = verb
                    for before_verb in tokenizer.before_verbs:
                        self.verbs[before_verb + '_' + bon] = verb
Example #28
    def __init__(self):

        self.NaiveBayesClassifier = NaiveBayesClassifier()

        # Sentence Splitters
        self.RuleBasedSentenceSplitter = RuleBasedSentenceSplitter()
        self.MLBasedSentenceSplitter = MLBasedSentenceSplitter()

        # Tokenizers
        self.RuleBasedTokenizer = RuleBasedTokenizer()
        self.MLBasedTokenizer = MLBasedTokenizer()

        # Normalizer
        self.Normalizer = Normalizer()

        # Stemmer
        self.Stemmer = Stemmer()

        # Stopword Eliminators
        self.StaticStopWordEliminator = StaticStopwordRemover()
        self.DynamicStopWordEliminator = DynamicStopWordEliminator()
Example #29
File: index.py  Project: do3cc/Scanned-Docs
def index(text, accepted_languages=None, langs=None):
    registry = get_current_registry()
    if accepted_languages is None:
        accepted_languages = [
            x.strip()
            for x in registry.settings["accepted_languages"].split(",")
        ]
    if langs is None:
        lang = guessLanguage(text)
        if lang not in accepted_languages:
            langs = accepted_languages
        else:
            langs = [lang]
    langs = list(set(langs).intersection(set(accepted_languages)))
    if not langs:
        langs = accepted_languages
    indexed_words = set()
    for lang in langs:
        stemmer = Stemmer(lang)
        indexed_words.update(
            [stemmer.stemWord(x.value) for x in tokenize(text)])
    return indexed_words
Example #30
def nonField_query(path, text, secondary_index_list):
    #print(1)
    text = text.lower()
    text = text.encode('ascii', errors='ignore').decode()
    text = re.sub(r'[^A-Za-z0-9]+', r' ', text)
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filter_sentence = [w for w in word_tokens if w not in stop_words]
    # filter_sentence = []
    # for w in word_tokens:
    #     if w not in stop_words:
    #         filter_sentence.append(w)
    stemmer = Stemmer('porter')
    stem_text = []
    for word in filter_sentence:
        stem_text.append(stemmer.stemWord(word))
    #print(word)
    result_list = []
    #print(stem_text)
    for word in stem_text:
        result_list.append(Posting(secondary_index_list, word, path))
    return result_list