Example #1
def preprocess(data, stem_data, remove_stopwords):
    processed = []
    stemmer = PorterStemmer()
    for file in data:

        # lowercasing all text

        file = str(file).lower()

        # removing non-alpha characters
        file = re.sub('[^a-zA-Z]', ' ', file)

        # tokenizing articles
        tokenized = word_tokenize(file)

        # removing stop words from tokens
        stop_removed_tokens = []
        if remove_stopwords:
            for word in tokenized:
                if word not in stop_words:
                    stop_removed_tokens.append(word)
        else:
            stop_removed_tokens = tokenized
        if stem_data:
            stemmed = []
            for token in stop_removed_tokens:
                stemmed.append(stemmer.stem(token))
            processed.append(stemmed)
        else:
            processed.append(stop_removed_tokens)
    return processed
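
A minimal way to exercise the snippet above, assuming it is pasted into the same file. The gensim/NLTK imports and the `stop_words` set are assumptions here; the original module defines them elsewhere, and the NLTK `punkt` and `stopwords` data must be downloaded.

import re
from gensim.parsing.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))  # assumed; the original project builds its own set

docs = ["The 2 cats were running quickly.", "Stemming maps words to their stems!"]
print(preprocess(docs, stem_data=True, remove_stopwords=True))
# e.g. [['cat', 'run', 'quickli'], ['stem', 'map', 'word', 'stem']]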
Example #2
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
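
A quick check of this stem_text variant; `utils` here is taken to be gensim.utils, which is an assumption based on the to_unicode call.

from gensim import utils
from gensim.parsing.porter import PorterStemmer

print(stem_text("Running dogs were barking loudly"))
# e.g. 'run dog were bark loudli'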
Example #3
def get_top_labels(country_scores):
    """Output: Dictionary --> key = country, value = list of top labels"""
    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents(
        [str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by="tfidf", ascending=False)
    country_scores_pmi = country_scores.sort_values(by="pmi", ascending=False)
    top_labels = [[] for x in range(country_scores['num_countries'][0])]
    top_labels_pmi = [[]
                      for x in range(country_scores_pmi['num_countries'][0])]

    used_stems = set()
    used_stems_pmi = set()

    for row in country_scores.itertuples():
        if row.stem not in used_stems:
            if len(top_labels[row.country]) < 40:
                top_labels[row.country].extend([
                    row.label.lower().replace('_', ' ').strip(), row.tfidf,
                    row.pmi, row.country
                ])
                used_stems.add(row.stem)

    for row in country_scores_pmi.itertuples():
        if row.stem not in used_stems_pmi:
            if len(top_labels_pmi[row.country]) < 40:
                top_labels_pmi[row.country].extend([
                    row.label.lower().replace('_', ' ').strip(), row.tfidf,
                    row.pmi, row.country
                ])
                used_stems_pmi.add(row.stem)

    return top_labels, top_labels_pmi
Example #4
def assign_country_label_ids(country_scores, label_score, num_candidates, use_label_candidates):
    """Output: Dictionary --> key = country, value = label"""

    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    used_stems = set()

    if use_label_candidates is True:
        # print('USING SOFT LABELING')
        final_labels = defaultdict(set)
        final_ids = defaultdict(set)

        for row in country_scores.itertuples():
            if len(final_labels[row.country]) <= num_candidates and row.stem not in used_stems and row.stem not in BLACK_LIST:
                final_labels[row.country].add((row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi))
                final_ids[row.country].add(int(row.label_id))
                used_stems.add(row.stem)
    else:

        final_labels = {}
        final_ids = {}

        for row in country_scores.itertuples():
            if row.country not in final_labels and row.stem not in used_stems and row.stem not in BLACK_LIST:
                final_labels[row.country] = [row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi]
                final_ids[row.country] = row.label_id
                used_stems.add(row.stem)
    return final_labels, final_ids
Example #5
def process(text):
    p = PorterStemmer()
    doc_nor = text.lower()
    doc_sw = remove_stopwords(doc_nor)
    doc_stem = p.stem_sentence(doc_sw)
    # tokenize on word characters so any leftover punctuation is dropped
    stem = re.findall(r'[\w]+', doc_stem)
    return stem
Example #6
def preprocess(text):
    #convert text to lower case
    text = text.lower()

    #removing leading/trailing whitespace
    text = text.strip()

    #removing digits
    text = gensim.parsing.preprocessing.strip_numeric(text)
    #text = ' '.join(s for s in text.split() if not any(c.isdigit() for c in s))

    #print(text)

    #remove stopwords
    text = gensim.parsing.preprocessing.remove_stopwords(text)

    #strip punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)

    #strip multiple whitespace that might occur after we remove stopwords
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)

    p = PorterStemmer()

    text = ' '.join(p.stem(word) for word in text.split())

    #print(text)

    return text
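
A short demonstration of the chain above. Note that strip_punctuation2 is only present in some gensim releases as an alias of strip_punctuation; if your version lacks it, substitute strip_punctuation in the snippet.

import gensim.parsing.preprocessing
from gensim.parsing.porter import PorterStemmer

sample = "In 2024, the 3 researchers were quickly preprocessing documents."
print(preprocess(sample))
# e.g. 'research quickli preprocess document'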
Example #7
def dataToXYListRead(fileName):
    with open(fileName) as file:
        porter_stemmer = PorterStemmer()
        lineCount = 0
        wordSentenceDbLi = []
        while True:
            line = file.readline()
            if not line:
                break
            # if lineCount == 20:
            #     break
            jsonLine = json.loads(line)

            # noStopWords = remove_stopwords(jsonLine['text'])
            # stemWords = porter_stemmer.stem(noStopWords)
            # stem each word, not the whole string at once
            stemWords = porter_stemmer.stem_sentence(jsonLine['text'])
            tokenWords = simple_preprocess(stemWords, deacc=True)

            # print(tokenWords)
            wordSentenceDbLi.append(tokenWords)
            lineCount += 1
        # yelpDic = corpora.Dictionary(wordSentenceDbLi)
        # yelpDic.save('yelpDictionary.dict')
        # print(yelpDic.token2id)
        # print(yelpDic[8])

        return wordSentenceDbLi
Example #8
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`.
    """
    p = PorterStemmer()
    return ' '.join(
        p.stem(word) for word in
        text.lower().split())  # lowercasing required by the stemmer
Example #9
def document_preprocess(text):
    p = PorterStemmer()
    first = text.encode('ascii', 'ignore').decode('utf-8').lower()
    second = preprocessing.remove_stopwords(first)
    third = preprocessing.strip_punctuation(second)
    fourth = preprocessing.strip_short(preprocessing.strip_numeric(third))
    fifth = p.stem_sentence(fourth)  # stem each word of the cleaned text
    return fifth
Example #10
 def open_spider(self, spider):
     # Create an empty model
     w2v = gensim.models.Word2Vec([['seo']], min_count=1)
     self.name = '/tmp/Word2Vec' + str(time.time())
     # Save it
     w2v.save(self.name)
     self.p = PorterStemmer()
     self.stop_words = set(stopwords.words('french'))
Example #11
def token_stem(text):

    tokens = simple_preprocess(text, deacc=True)

    porter_stemmer = PorterStemmer()
    stem_tokens = [porter_stemmer.stem(word) for word in tokens]

    return stem_tokens
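
Assumed imports for the snippet above, plus a quick call showing the deaccenting and stemming together.

from gensim.utils import simple_preprocess
from gensim.parsing.porter import PorterStemmer

print(token_stem("Études in stemming"))
# e.g. ['etud', 'in', 'stem'] -- deacc=True strips the accent before stemming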
Example #12
def preprocess_data(train_data, test_data):
    custom_stopwords = set(ENGLISH_STOP_WORDS)
    custom_stopwords.update(["say", "says", "said", "saying", "just", "year", "man", "men", "woman", \
     "women", "guy", "guys", "run", "running", "ran", "run", "do", "don't", "does", "doesn't" , \
     "doing", "did", "didn't",  "use", "used", "continue", "number", "great", "big", "good", "bad", \
     "better", "worse", "best", "worst", "actually", "fact", "way", "tell", "told", "include", "including", \
     "want", "wanting", "will", "won't", "give", "given", "month", "day", "place", "area", "look", \
     "looked", "far", "near", "get", "getting", "got", "know", "knows", "knew", "long", "week", "have", \
     "has", "haven't", "hasn't", "having", "had", "hadn't", "not", "think", "thinking", "Monday", \
     "Tuesday", "Wednesday", "Thursday", "Saturday", "Sunday", "high", "low", "thing", "there", "they're", \
     "It", "I've", "I'd", "He's", "She's", "They've", "I'm", "You're", "your", "their", "his", "hers", \
     "mine", "today", "yesterday", "it", "ve", "going", "go", "went", "lot", "don", "saw", "seen", "come", "came"])

    titled_train_data = add_titles(train_data['Content'], train_data['Title'])
    if test_data is not None:
        titled_test_data = add_titles(test_data['Content'], test_data['Title'])

    # Removing stopwords:
    new_train_data = []
    for doc in titled_train_data:
        doc_wordlist = doc.split()
        new_doc_wordlist = [
            word for word in doc_wordlist if word not in custom_stopwords
        ]
        new_doc = ' '.join(new_doc_wordlist)
        new_train_data.append(new_doc)
    if test_data is not None:
        new_test_data = []
        for doc in titled_test_data:
            doc_wordlist = doc.split()
            new_doc_wordlist = [
                word for word in doc_wordlist if word not in custom_stopwords
            ]
            new_doc = ' '.join(new_doc_wordlist)
            new_test_data.append(new_doc)

    p = PorterStemmer()
    train_docs = p.stem_documents(new_train_data)
    if test_data is not None:
        test_docs = p.stem_documents(new_test_data)
    print "my_method: Stemmed data."

    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(train_docs)
    if test_data is not None:
        Test = vectorizer.transform(test_docs)
    else:
        Test = None
    print "my_method: Vectorized data"

    svd_model = TruncatedSVD(n_components=200)  # random_state=13
    X = svd_model.fit_transform(X)
    if test_data is not None:
        Test = svd_model.transform(Test)
    print "SVD'd data"

    return X, Test
Example #13
 def find_documents(self, term, stemming=False):
     stemmer = PorterStemmer()
     if stemming:
         term = stemmer.stem(term)
     term_id = self.get_id_for_term(term)
     if term_id < 0:
         return set()
     docs = self.get_related_documents(term_id)
     return set(docs)
Example #14
 def cleanText(self, textToClean):
     textLower = str(textToClean).lower()
     englishText = "".join(
         [char for char in textLower if char in string.printable])
     textNoPunc = "".join(
         [char for char in englishText if char not in string.punctuation])
     textStop = remove_stopwords(textNoPunc)
     porter = PorterStemmer()
     textStemmed = porter.stem_sentence(textStop)  # stem each word, not the whole string
     return (textStemmed.split())
Example #15
 def __iter__(self):
     p = PorterStemmer()
     for entry in scandir("./dblpfiledir"):
         with open(entry.path, "r", encoding="utf-8") as f:
             jsoncontent = json.load(f)
             doc = jsoncontent["abstract"]
             if len(doc) > 0:
                 doc = remove_stopwords(doc)
                 doc = p.stem_sentence(doc)
                 words = simple_preprocess(doc, deacc=True)
                 yield TaggedDocument(words=words,
                                      tags=[jsoncontent['index']])
Example #16
def preprocess_documents(documents):
    # preprocess each doc
    documents = [preprocess_doc(doc) for doc in documents]

    # stem the documents
    stemmer = PorterStemmer()
    documents = stemmer.stem_documents(documents)

    # split all the documents into list of tokens
    documents = [doc.split() for doc in documents]

    return documents
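
preprocess_doc() is defined elsewhere in the original project; the stub below is only a placeholder so the snippet can be run end to end.

from gensim.parsing.porter import PorterStemmer

def preprocess_doc(doc):
    # placeholder for the project's real cleaning step (assumption)
    return doc.lower()

print(preprocess_documents(["Dogs were barking", "Cats are running"]))
# e.g. [['dog', 'were', 'bark'], ['cat', 'are', 'run']]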
Example #17
 def __iter__(self):
     p = PorterStemmer()
     for index, row in self.train_data.iterrows():
         name = row['ScriptLink']
         with open('./movie_scripts/' + name) as file:
             #print("Im here")
             script = file.readlines()
             script = "".join(script)
             script = remove_stopwords(script)
             script = p.stem_sentence(script)
             words = simple_preprocess(script)
             yield TaggedDocument(words=words, tags=[index])
Example #18
def load_data(tweets_tsv, tweets_postag):
    """
  Return tweets id,user id,tweets label,raw tweets,tokenized tweets,
  tweets in PoS, PoS tagged tweets and stemmed tweets in a pandas Dataframe.
  
  :param tweets_tsv: <SID><tab><UID><tab><CLASS><tab><TWITTER_MESSAGE>
  :parm tweets_postag: ark-TweetNLP `./runTagger.sh --output-format conll --input-formt txt --input-field 4` 
  :rtype: pandas.DataFrame
  """

    o = open(tweets_tsv, 'r', encoding='utf-8').readlines()
    p = open(tweets_postag).read()

    raw = p.split('\n\n')
    raw_pos_data = [line.split('\n') for line in raw]
    pos_data = []
    for tweet in raw_pos_data:
        pos_data.append([tuple(word_pos.split('\t')) for word_pos in tweet])

    stemmer = PorterStemmer()

    data = {}
    for idx, line in enumerate(o):
        tweet_id, user_id, adr, text = line.split('\t')
        data[tweet_id] = {}
        data[tweet_id]['user_id'] = user_id
        data[tweet_id]['adr'] = adr
        data[tweet_id]['raw_text'] = text
        data[tweet_id]['stem_text'] = [
            stemmer.stem(w_pos[0]) for w_pos in pos_data[idx]
        ]
        data[tweet_id]['tok_text'] = [w_pos[0] for w_pos in pos_data[idx]]
        data[tweet_id]['pos_token'] = [w_pos[1] for w_pos in pos_data[idx]]
        data[tweet_id]['pos_text'] = [
            '#'.join(list(w_pos)) for w_pos in pos_data[idx]
        ]

    df = pd.DataFrame.from_dict(data, orient='index')

    df.adr = df.adr.astype('int')
    df.user_id = df.user_id.astype('int')

    logger.info("Loaded dataframe from {0} and {1}".format(
        tweets_tsv, tweets_postag))
    logger.info("Dataframe information:\n")
    df.info()

    return df
Example #19
def main():
    ############################## Setup Code #####################################
    global document_index
    path = "./myroot"
    file3 = open("cmptext.txt", "w+")
    number_of_documents = recursive_read(path, file3)
    file3.close()
    print('All files read')
    file3 = open("cmptext.txt", "r")
    preprocess(file3, number_of_documents)
    file3.close()
    print('All files processed')
    print('Word2Vec begins')
    model = get_word2vec(number_of_documents)  #includes trigrams
    model.save('vocab.txt')
    print('Word2Vec done')
    vocabulary = model.wv.vocab.keys()
    inverted_index = get_inverted_index(vocabulary)
    for item in list(inverted_index.keys()):  # copy keys so entries can be deleted while iterating
        if not inverted_index[item]:
            del inverted_index[item]
    with open("inverted-index.txt", "wb") as fp:
        pickle.dump(inverted_index, fp)
    fp.close()
    get_tfidf_vectors(inverted_index, number_of_documents)
    get_norms()
    doc_num = 0
    file1 = open("cmptext.txt", "r")
    stemmer = PorterStemmer()
    for document in file1:
        spreprocessed = []
        doc_num += 1
        for line in document.split('. '):
            temp1 = []
            temp2 = []
            temp1 = gensim.utils.simple_preprocess(line, max_len=20)
            for word in temp1:
                if word not in stop_words:
                    temp2.append(word)
            spreprocessed.append(stemmer.stem_documents(temp2))
        with open("spreprocessed" + str(doc_num) + ".txt", "w+") as fp:
            pickle.dump(spreprocessed, fp)
        fp.close()
        del spreprocessed[:]
    file1.close()
    with open("document-index.txt", "wb") as fp:
        pickle.dump(document_index, fp)
    fp.close()
Example #20
class Word2VecPipeline(object):
    def open_spider(self, spider):
        # Create an empty model
        w2v = gensim.models.Word2Vec([['seo']], min_count=1)
        self.name = '/tmp/Word2Vec' + str(time.time())
        # Save it
        w2v.save(self.name)
        self.p = PorterStemmer()
        self.stop_words = set(stopwords.words('french'))

    def process_item(self, item, spider):
        if 'title' in item:
            # This time, we don't update the item, instead we build the model.
            document = item.get('title') + ' ' + item.get('body')
            words = [
                word_tokenize(self.p.stem_sentence(s))
                for s in sent_tokenize(document)
            ]
            # Load current model
            w2v = gensim.models.Word2Vec.load(self.name)
            # Train our model
            w2v.build_vocab(words, update=True)
            w2v.train(words, total_examples=w2v.corpus_count, epochs=w2v.iter)
            # Save it for the next item
            w2v.save(self.name)
        return item
Example #21
    def __init__(self,
                 docs: List[str],
                 index_path: str,
                 root: str = "lyrics/") -> None:
        """Initialize Indexer by assigning attributes and opening index file.

        Args:
            docs: List of document filenames.
            index_path: Path to the index file.
            root: Directory where the song lyrics are stored.

        """
        self.root = root
        self.docs = docs
        self.stemmer = PorterStemmer()
        self.get_word_count()
        self.index = shelve.open(index_path)
Example #22
 def __init__(self):
     self.morph = {'ru': MorphAnalyzer(), 'en': PorterStemmer()}
     self.other_significance = 10
     self.stopwords = dict()
     with open('../thirdparty/stop_ru.json', 'r', encoding='utf-8') as f:
         self.stopwords['ru'] = json.load(f)
     with open('../thirdparty/stop_en.json', 'r', encoding='utf-8') as f:
         self.stopwords['en'] = json.load(f)
Example #23
def assign_country_label_ids(country_scores, label_score):
    """Output: Dictionary --> key = country, value = label"""

    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    used_stems = set()

    final_labels = {}
    final_ids = {}

    for row in country_scores.itertuples():
        if row.country not in final_labels and row.stem not in used_stems and row.stem not in BLACK_LIST:
            final_labels[row.country] = [row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi]
            final_ids[row.country] = row.label_id
            used_stems.add(row.stem)
    return final_labels, final_ids
Example #24
def get_top_labels(country_scores, label_score, num_candidates=5):
    """Output: Dictionary --> key = country, value = list of top labels"""

    ps = PorterStemmer()
    country_scores['stem'] = ps.stem_documents([str(word) for word in country_scores['label']])
    country_scores = country_scores.sort_values(by=label_score, ascending=False)
    num_labels_per_country = defaultdict(int)
    top_labels = []
    used_stems = set()

    for row in country_scores.itertuples():
        if row.stem not in used_stems:
            if num_labels_per_country[row.country] < num_candidates:
                top_labels.append([row.country, row.label_id, row.label.lower().replace('_', ' ').strip(), row.tfidf, row.pmi])
                used_stems.add(row.stem)
                num_labels_per_country[row.country] += 1

    return top_labels
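
The DataFrame columns this snippet reads (country, label_id, label, tfidf, pmi) are implicit in the code; the toy frame below is an assumption about their layout, made only to show the call.

from collections import defaultdict  # also needed by the snippet above
import pandas as pd
from gensim.parsing.porter import PorterStemmer

scores = pd.DataFrame({
    'country':  [0, 0, 1],
    'label_id': [101, 102, 201],
    'label':    ['Ice_hockey', 'Hockey_sticks', 'Surfing'],
    'tfidf':    [0.9, 0.8, 0.7],
    'pmi':      [0.4, 0.5, 0.6],
})
for entry in get_top_labels(scores, label_score='tfidf', num_candidates=2):
    print(entry)  # [country, label_id, label, tfidf, pmi]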
Example #25
def preprocess(file_name, number_of_documents):
    stemmer = PorterStemmer()
    fp1 = open("preprocessed.txt", "wb")
    fp2 = open("preprocessed-cmptext.txt", "wb")
    pickle.dump(number_of_documents, fp1)
    for line in file_name:
        preprocess_list1 = gensim.utils.simple_preprocess(line, max_len=20)
        preprocess_list2 = []
        for word in preprocess_list1:
            if word not in stop_words:
                preprocess_list2.append(word)
        pickle.dump(stemmer.stem_documents(preprocess_list2), fp1)
        for word in preprocess_list2:
            fp2.write(stemmer.stem(word))
            fp2.write(' ')
        fp2.write('\n')
    fp1.close()
    fp2.close()
Example #26
def spimi_invert(
    files: List[str],
    stemmer: PorterStemmer,
    blocks_dir: str,
    memory_available: int,
) -> List[str]:
    """SPIMI-Invert procedure.

    Collect terms, docIDs, term-frequencies into a block (dictionary
    of dictionaries) that fits in available memory, write each block's
    dictionary to disk, and start a new dictionary for the next block.

    Args:
        files: List of filepaths.
        stemmer: Gensim porter stemmer.
        blocks_dir: Directory where blocks are saved.
        memory_available: Available memory in bytes.

    Returns:
        List of filenames of saved blocks.

    """
    memory_used = 0
    outputed_blocks = []
    block_index = 0
    dictionary = {}
    for docId, token in token_stream(files):
        memory_used += sys.getsizeof(token)

        term = stemmer.stem(token)
        if term not in dictionary:
            dictionary[term] = {}
        if docId not in dictionary[term]:
            dictionary[term][docId] = 0
        dictionary[term][docId] += 1  # save term freq. in document

        if memory_used > memory_available:
            # Sort terms and write to disk
            with shelve.open(blocks_dir + "block" + str(block_index)) as f:
                for k in sorted(dictionary.keys()):
                    f[k] = dictionary[k]
            outputed_blocks.append("block" + str(block_index))
            block_index += 1
            memory_used = 0
            dictionary = {}

    # Save last block
    if dictionary:
        with shelve.open(blocks_dir + "block" + str(block_index)) as f:
            for k in sorted(dictionary.keys()):
                f[k] = dictionary[k]
        outputed_blocks.append("block" + str(block_index))
    return outputed_blocks
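
token_stream() belongs to the surrounding project and is not shown; the stub and throw-away corpus below are assumptions made just to exercise the procedure. The imports (typing, shelve, sys, PorterStemmer) must also be in scope above the snippet, since its signature and body reference them.

import os
import sys
import shelve
from typing import Iterator, List, Tuple
from gensim.parsing.porter import PorterStemmer

def token_stream(files: List[str]) -> Iterator[Tuple[int, str]]:
    # Stub: yield (docId, token) pairs; the real project parses its corpus files here.
    for doc_id, path in enumerate(files):
        with open(path, encoding="utf-8") as f:
            for token in f.read().split():
                yield doc_id, token

# throw-away two-document corpus
for name, text in [("doc0.txt", "dogs were barking"), ("doc1.txt", "cats are running")]:
    with open(name, "w", encoding="utf-8") as f:
        f.write(text)

os.makedirs("blocks", exist_ok=True)
blocks = spimi_invert(["doc0.txt", "doc1.txt"], PorterStemmer(),
                      blocks_dir="blocks/", memory_available=1_000_000)
print(blocks)  # e.g. ['block0'] -- the tiny corpus fits in a single block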
Example #27
def get_task_topic_dist(ldah, task):
    #clean, stem and tokenize the prompt/task string
    task = clean_text(task)
    task = PorterStemmer().stem_sentence(task)
    tokens = word_tokenize(task)
    tokens = [word for word in tokens if word not in stop_words]

    #compute topic distribution and sort
    dist = get_topic_dist(ldah, tokens)
    dist.sort()

    return dist
Example #28
def processing(body_text):
    p = PorterStemmer()
    stopset = set([
        'doi', 'preprint', 'copyright', 'org', 'https', 'et', 'al', 'author',
        'figure', 'table', 'rights', 'reserved', 'permission', 'use', 'used',
        'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.',
        'Elsevier', 'PMC', 'CZI', '-PRON-', 'usually', r'\usepackage{amsbsy',
        r'\usepackage{amsfonts', r'\usepackage{mathrsfs',
        r'\usepackage{amssymb', r'\usepackage{wasysym',
        r'\setlength{\oddsidemargin}{-69pt', r'\usepackage{upgreek',
        r'\documentclass[12pt]{minimal'
    ])
    cStopwords = STOPWORDS.union(stopset)
    resultlist = []
    for text in body_text:
        tokens = []
        for item in gensim.parsing.preprocess_string(text):
            if item not in cStopwords:
                tokens.append(p.stem(item))
        yield model.infer_vector(tokens)
Example #29
    def __init__(self):
        # info
        self.cluster_info = dict()
        self.article_info = dict()

        if config_meta['word_tokenizer'] == 'bert':
            self.word_tokenize = config.bert_tokenizer.tokenize
        elif config_meta['word_tokenizer'] == 'nltk':
            self.word_tokenize = nltk.tokenize.word_tokenize
        else:
            raise ValueError('Invalid word_tokenizer: {}'.format(
                config_meta['word_tokenizer']))

        self.sent_tokenize = nltk.tokenize.sent_tokenize
        self.porter_stemmer = PorterStemmer()

        if config_meta['texttiling']:
            self.para_tokenize = TextTilingTokenizer()

        # base pat
        BASE_PAT = r'(?<=<{0}> )[\s\S]*?(?= </{0}>)'
        BASE_PAT_WITH_NEW_LINE = r'(?<=<{0}>\n)[\s\S]*?(?=\n</{0}>)'
        BASE_PAT_WITH_RIGHT_NEW_LINE = r'(?<=<{0}>)[\s\S]*?(?=\n</{0}>)'

        # query pat
        self.id_pat = re.compile(BASE_PAT.format('num'))
        self.title_pat = re.compile(BASE_PAT.format('title'))
        self.narr_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('narr'))

        # article pat
        self.text_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('TEXT'))
        self.graphic_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('GRAPHIC'))
        self.type_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('TYPE'))
        self.para_pat = re.compile(BASE_PAT_WITH_NEW_LINE.format('P'))

        self.proc_params_for_questions = {
            'rm_dialog': False,
            'rm_stop': False,
            'stem': True,
        }
Example #30
def stem_text(text):
    """Transform `s` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import stem_text
    >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
    u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
Example #31
def stem_text(text):
    """Transform `s` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    >>> from gensim.parsing.preprocessing import stem_text
    >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
    u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    """
    #text = utils.to_unicode(text)
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.split())
Example #32
def stem_text(text):
    """
    Return lowercase and (porter-)stemmed version of string `text`. 
    """
    p = PorterStemmer()
    return ' '.join(p.stem(word) for word in text.lower().split()) # lowercasing required by the stemmer