def __iter__(self):
    # for line in open('ebola-raw.txt'):
    # for line in open('twitter2Mb.txt'):
    for line in open('testdata01.txt'):
        line = re.sub('<[^>]+>', '', line)  # strip HTML tags
        utils.lemmatize(line)  # NOTE: return value is discarded, so this call has no effect
        # assume there's one document per line, tokens separated by whitespace
        yield dictionary.doc2bow(line.lower().split())
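# A minimal usage sketch for a streaming corpus built around an __iter__ like
# the one above; the class name and file name here are hypothetical, and the
# dictionary is built in a first pass over the same file.
from gensim import corpora

class LineCorpus(object):
    def __init__(self, path, dictionary):
        self.path = path
        self.dictionary = dictionary

    def __iter__(self):
        for line in open(self.path):
            yield self.dictionary.doc2bow(line.lower().split())

# dictionary = corpora.Dictionary(line.lower().split() for line in open('testdata01.txt'))
# corpus = LineCorpus('testdata01.txt', dictionary)  # streamed, never fully in memory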
def lemmatize_an_idea(idea, use_stoplist=True):
    # Memoized: repeated ideas are served from lemma_dict without re-lemmatizing.
    if idea in lemma_dict:
        return lemma_dict[idea]
    # lem[:-3] strips the 3-character POS suffix (e.g. '/NN') from each lemma.
    if use_stoplist:
        lemm = [lem[:-3] for lem in lemmatize(idea) if lem[:-3] not in stoplist]
    else:
        lemm = [lem[:-3] for lem in lemmatize(idea) if lem[:-3]]
    lemma_dict[idea] = lemm
    return lemm
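# Hedged usage sketch for the memoized lemmatizer above, assuming gensim < 4.0
# with the `pattern` package installed; lemma_dict and stoplist are the
# module-level globals the function expects.
lemma_dict = {}
stoplist = set()
# The first call runs pattern's lemmatizer; the second is a pure cache hit:
# lemmatize_an_idea('the cats were running')  # -> e.g. ['cat', 'be', 'run']
# lemmatize_an_idea('the cats were running')  # returned from lemma_dict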
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    categories = get_categories(text)
    if not list(set(categories).intersection(input_categories)):
        return None, None, None, None
    text = filter_wiki(text)
    sentences = sentence_tokenize(text)
    title = title.replace(' ', '_')
    paragraphs = {}
    # Split document into paragraphs; sentences = [s0, s1, t0, s2, t1, ...]
    paragraph_title = [title]
    level = 1
    this_sentences = []
    for sent in sentences:
        if sent[:1] == '=':
            # Sent is a paragraph title: flush the sentences collected so far
            pt = '/'.join(paragraph_title)
            pt = pt.replace(',', '')
            paragraphs[pt] = this_sentences
            this_sentences = []
            # Nesting level of the heading, from the length of the '=' runs
            level = max(len(s) for s in re.findall(r'=+', sent))
            this_title = sent[level:len(sent) - level].strip().replace(' ', '_')
            if level > len(paragraph_title):
                paragraph_title.append(this_title)
            elif level < len(paragraph_title):
                for i in range(len(paragraph_title) - level):
                    paragraph_title.pop()
                paragraph_title[level - 1] = this_title
            else:
                paragraph_title[level - 1] = this_title
        else:
            this_sentences.append(sent)
    pt = '/'.join(paragraph_title)
    pt = pt.replace(',', '')
    paragraphs[pt] = this_sentences
    # Note: len(v) >= 0 is always true, so no paragraphs are filtered out here,
    # and utils.lemmatize runs twice per sentence (once to filter, once to keep).
    if lemmatize:
        result = {k: [utils.lemmatize(s) for s in v if len(utils.lemmatize(s)) >= 2]
                  for k, v in paragraphs.items() if len(v) >= 0}
    else:
        result = {k: [word_tokenize(s) for s in v if len(word_tokenize(s)) >= 2]
                  for k, v in paragraphs.items() if len(v) >= 0}
    return categories, result, title, pageid
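# Sketch of the paragraphs dict the function above builds, assuming a
# hypothetical article titled "Ebola" with one level-2 section heading
# "== Signs and symptoms ==" (section paths join the title stack with '/'):
#
# paragraphs = {
#     'Ebola': ['First lead sentence.', 'Second lead sentence.'],
#     'Ebola/Signs_and_symptoms': ['First sentence of the section.'],
# }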
def lemmatizeCorpus(document, isListOfDocs=False):
    if isListOfDocs:
        docs = []
        for doc in document:
            _lemmatizedTokens = lemmatize(doc)
            docs.append([token.decode("utf-8").split("/")[0]
                         for token in _lemmatizedTokens])
        return docs
    else:
        _lemmatizedTokens = lemmatize(document)
        return [token.decode("utf-8").split("/")[0]
                for token in _lemmatizedTokens]
def __init__(self, searchPhrase, dbname='TwitterDB', host='localhost', port=27017,
             query=None, k=0):
    self.queries = Queries(dbname=dbname, host=host, port=port)
    self.words = [word.split('/')[0]
                  for word in lemmatize(cleanText.removeStopWords(
                      cleanText.cleanText(searchPhrase)[0]))]
    self.idfs = dict()
    and_list = []
    if self.words:
        for word in self.words:
            and_list.append({'words.word': word})
        self.query_search = {"$and": and_list}
    if query:
        self.existing = True
        self.query_search.update(query)
    else:
        self.existing = False
    self.k = k
def proc_ent(ent):
    ent = (ent.lower().replace('.', ' ').replace('-', ' ').strip()
              .replace('_', ' ').replace('|', ' ').strip())
    ent = ' '.join([tok.decode('utf-8').split('/')[0] for tok in lemmatize(ent)])
    # ent = ' '.join(list(set(ent.split()) - set(config.stpwords)))
    return ent
def preprocess_data(cls):
    """
    Process the ground-truth data we are going to test on and return the result.
    """
    preprocessed_description, preprocessed_speciality = [], []
    for _, sentence in enumerate(CURO_DATA["Description"].values):
        # Expand contractions so that no meaning is lost
        sentence = CURO().contraction(str(sentence))
        # Eliminate words that contain numbers
        sentence = re.sub(r"\S*\d\S*", "", sentence).strip()
        # Eliminate all remaining numerics and special characters
        sentence = re.sub('[^A-Za-z]+', " ", sentence)
        # Remove all stopwords from each sentence and convert to lowercase
        sentence = " ".join(e.lower() for e in str(sentence).split()
                            if e.lower() not in STOPWORDS)
        # Lemmatize all words
        sentence = " ".join([word.decode('utf-8').split('/')[0]
                             for word in lemmatize(sentence)])
        preprocessed_description.append(sentence.strip())
    for _, sentence in enumerate(CURO_DATA["Speciality"].values):
        # Drop the "@#$" separator when its right side is empty; otherwise
        # replace it with " => "
        sentence = sentence.replace("@#$", "") if not sentence.split("@#$")[1] \
            else sentence.replace("@#$", " => ")
        # Convert each sentence to lowercase
        sentence = " ".join(e.lower() for e in str(sentence).split())
        preprocessed_speciality.append(sentence.strip())
    CURO_DATA["Preprocessed_Description"] = preprocessed_description
    CURO_DATA["Preprocessed_Speciality"] = preprocessed_speciality
def foodwordReplacedTokenizer(review):
    """
    Expand contractions, lemmatize, and replace food-related words with "FOODWORD".
    """
    # Expand contractions
    words = []
    for word in review.split():
        word = word.lower()
        if word in contractions:
            word = contractions[word]
        words += [word]
    review = ' '.join(words)
    # Lemmatize from parts of speech
    tokens = []
    for lemma in utils.lemmatize(review):
        lemma, pos = lemma.split('/')
        tokens += [lemma]
    # Re-merge for more processing
    lemmatized_review = ' '.join(tokens)
    # Join "not" with the word that follows it
    formatted_lm_review = lemmatized_review.replace(' not ', ' not_')
    # Food word replacement
    words = []
    for word in formatted_lm_review.split():
        if 'noun.food' in [syn.lexname() for syn in wn.synsets(word)]:
            words += ['FOODWORD']
        else:
            words += [word]
    return ' '.join(words)
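# Hedged example for foodwordReplacedTokenizer above; `contractions` is the
# project's contraction-expansion dict and `wn` is nltk.corpus.wordnet, both
# assumed by the function. Illustrative output only:
#
# contractions = {"don't": "do not"}
# foodwordReplacedTokenizer("Don't eat the pizza")
#   -> e.g. 'do not_eat FOODWORD'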
def process_texts(bigram, texts):
    """
    Function to process texts. The steps we take are:

    1. Stopword removal.
    2. Collocation detection.
    3. Lemmatization (not stemming, since stemming can reduce interpretability).

    Parameters:
    ----------
    bigram : trained bigram Phrases model used to merge collocations
    texts : tokenized texts

    Returns:
    -------
    texts : pre-processed tokenized texts
    """
    texts = [[word for word in line if word not in stops] for line in texts]
    texts = [bigram[line] for line in texts]
    texts = [[word.split('/')[0]
              for word in lemmatize(' '.join(line),
                                    allowed_tags=re.compile('(NN)'),
                                    min_length=3)]
             for line in texts]
    return texts
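# A hedged usage sketch for process_texts above, assuming gensim < 4.0 with the
# `pattern` package installed; `stops` is the module-level stopword set the
# function reads.
from gensim.models import Phrases

stops = {'the', 'of'}
texts = [['the', 'old', 'testament', 'of', 'the', 'bible'],
         ['the', 'new', 'testament', 'of', 'the', 'bible']]
bigram = Phrases(texts, min_count=1, threshold=1)  # collocation detector
# process_texts(bigram, texts)
# -> noun-only lemmas of length >= 3, e.g. [['testament', 'bible'], ['testament', 'bible']]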
def phrases(self, clean_text):
    all_lemmas = lemmatize(clean_text, stopwords=self.stopwords)
    curated_words = [str(word).split('/')[0] for word in all_lemmas]
    curated_text = ' '.join(curated_words)
    doc = textacy.Doc(curated_text, lang='en')
    all_phrases = []
    # Collect n-grams of length 2 through 5, filtering stopwords, punctuation and numbers
    for n in (2, 3, 4, 5):
        all_phrases += textacy.extract.ngrams(doc, n, filter_stops=True,
                                              filter_punct=True, filter_nums=True)
    phrases = [str(phrase) for phrase in all_phrases]
    return phrases
def preprocess_text(tweet):
    """
    Process an aggregated user profile. This does the following:

    1. Decode HTML entities, e.g. "AT&amp;T" becomes "AT&T".
    2. Deaccent.
    3. Remove links.
    4. Remove any user mentions (@name).
    5. Lemmatize and remove stopwords.

    Parameters:
    ----------
    tweet : string. If train_texts is a list of tweets, ' '.join and pass.

    Returns:
    -------
    tweet : preprocessed (tokenized) tweet
    """
    tweet = decode_htmlentities(tweet)  # Step 1
    tweet = deaccent(tweet)  # Step 2
    tweet = tweet.encode('ascii', 'ignore')  # To prevent UnicodeDecodeErrors later on
    tweet = re.sub(r'http\S+', '', str(tweet))  # Step 3
    tweet = re.sub(r'@\w+', '', str(tweet))  # Step 4
    tweet = tweet.split()
    tweet = lemmatize(' '.join(tweet), re.compile('(NN)'),
                      stopwords=stopwords.words('english'),
                      min_length=3, max_length=15)  # Step 5
    tweet = [word.split('/')[0] for word in tweet]
    return tweet
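# Hedged input/output example for preprocess_text above (gensim < 4.0 with
# pattern installed; decode_htmlentities is the project's HTML-unescaping helper):
#
# preprocess_text("Loving the new release! http://t.co/abc @radim")
#   -> the URL and mention are stripped, and only noun lemmas of length 3-15
#      survive, e.g. ['release']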
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
                    token_max_len=TOKEN_MAX_LEN, lower=True):
    """Parse a Wikipedia article, extract all tokens.

    Notes
    -----
    Set the `tokenizer_func` parameter (default is
    :func:`~gensim.corpora.wikicorpus.tokenize`) for languages like Japanese or
    Thai to perform better tokenization.
    The `tokenizer_func` needs to take 4 parameters:
    (text: str, token_min_len: int, token_max_len: int, lower: bool).

    Parameters
    ----------
    args : (str, bool, str, int)
        Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize`
        will be used), article title, page identifier.
    tokenizer_func : function
        Function for tokenization (default is
        :func:`~gensim.corpora.wikicorpus.tokenize`). Needs to have interface:
        tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
        Convert article text to lower case?

    Returns
    -------
    (list of str, str, int)
        List of tokens from article, title and page id.
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenizer_func(text, token_min_len, token_max_len, lower)
    return result, title, pageid
def preprocess_text(lemma, document):
    with open(document, 'r') as infile:
        # Transform the document into one string
        text = ' '.join(line.rstrip('\n') for line in infile)
    # Convert the string into unicode
    text = gensim.utils.any2unicode(text)
    # Remove URLs
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    # Remove symbols, excluding @, # and whitespace
    text = re.sub(r'[^\w@#\s]', '', text)
    if lemma:
        return utils.lemmatize(text, stopwords=ignore_words, min_length=3)
    # Tokenize words using the NLTK Twitter tokenizer
    tknzr = TweetTokenizer()
    text = tknzr.tokenize(text)
    # Lowercase, drop tokens shorter than 3 characters, and drop numbers
    text = [word.lower() for word in text
            if len(word) > 2 and not word.isdigit()]
    # Remove stopwords
    return [word for word in text if word not in ignore_words]
def lemmaSentence1(i, curSentence):
    x = lemmatize(curSentence)
    x = set([y.decode('utf-8').split('/')[0] for y in x])
    x = [str(y).lower() for y in x if len(y) > 2]
    # print("Completed")
    print("Completed for i {0}".format(i))
    return TaggedDocument(words=x, tags=[str(i)])
def extract_user(user):
    with open('../data/' + user + '/interesting_articles.txt') as stalk_f:
        articles = filter(lambda x: x != '',
                          stalk_f.read().rstrip('\n').split(' '))
    tastes = numpy.array([0.0 for i in range(lda.num_topics)])
    total = 0.0
    having = 0
    not_having = 0
    for article in articles:
        # print article
        try:
            text = open('../data/' + article + '.txt').read()
            having += 1
        except IOError:
            # we don't have this article
            not_having += 1
            continue
        if LEMMATIZE:
            a = utils.lemmatize(text)
        else:
            print >> sys.stderr, "ERROR: install pattern"
            sys.exit(-1)
        for topicid, proba in lda[lda.id2word.doc2bow(a)]:
            total += proba
            tastes[topicid] += proba
    tastes /= total
    of = open(user + '.params', 'w')
    pickle.dump(tastes.tolist(), of)
    print "For user:"******" we had:", having, "and missed:", not_having, \
        "->", having * 100.0 / (having + not_having + 0.000001), "%"
def __init__(self, searchPhrase, dbname='TwitterDB', query=False, k=0):
    client = pymongo.MongoClient()
    self.db = client[dbname]
    self.words = [word.split('/')[0]
                  for word in lemmatize(cleanText.removeStopWords(
                      cleanText.cleanText(searchPhrase)[0]))]
    self.listSearch = {}
    self.query = query
    self.k = k
def get_texts(self):
    """
    Iterate over the HN articles, returning text.
    """
    positions, hn_articles = 0, 0
    # ************ HN articles ************
    fnamelist = []
    for g in glob.iglob(self.hn_folder + '/*.txt'):
        fnamelist.append(g)
    for fileno, fname in enumerate(fnamelist):
        hn_text = open(fname).read()
        hn_articles += 1
        if LEMMATIZE:
            result = utils.lemmatize(hn_text)  # text into lemmas here
            positions += len(result)
            yield result
        else:
            result = tokenize(hn_text)  # text into tokens here
            positions += len(result)
            yield result
    print(">>> finished iterating over HN corpus of %i documents with %i positions"
          % (hn_articles, positions))
    self.length = hn_articles  # cache corpus length
def _pos_tokenize_document(self, doc):
    tokens = simple_preprocess(doc)
    # Lemmatize, POS-tag, and remove stopwords (including empty strings)
    # from the token list for stories
    pos_tokens = [lemmatize(t) for t in tokens
                  if t not in STOPWORDS and len(t) > 0]
    # Flatten the list-of-lists of POS tokens created by the previous
    # operation and return
    return [word for inner_list in pos_tokens for word in inner_list]
def process_article(args):
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
def __init__(self, document):
    """
    :param document: A string with the content of the document.
    """
    # Use the pattern lemmatizer; see gensim.utils.lemmatize.
    # Note: by default, tokens longer than 15 characters are filtered out.
    self.clean_document_ = utils.lemmatize(document)
def clean(text):
    text = strip_multiple_whitespaces(strip_non_alphanum(text)).split()
    words = []
    for word in text:
        tmp = lemmatize(word)
        if tmp:
            # Strip the 3-character POS suffix (e.g. '/NN') from the lemma
            words.append(tmp[0][:-3].decode("utf-8"))
    return " ".join(words)
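# Quick sketch of clean() above (gensim < 4.0 + pattern assumed). Each word is
# lemmatized in isolation, so the POS tagger gets no sentence context:
#
# clean("The striped bats were hanging")  -> e.g. 'striped bat be hang'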
def parse(text):
    def tokenize(text):
        return [token.encode('utf8')
                for token in utils.tokenize(text, lower=True, errors='ignore')
                if 2 <= len(token) <= 20 and not token.startswith('_')]
    global LEMMATIZE
    if LEMMATIZE:
        return utils.lemmatize(text)
    else:
        return tokenize(text)
def __init__(self, searchPhrase, k=0):
    self.words = [word.split('/')[0]
                  for word in lemmatize(cleanText.removeStopWords(
                      cleanText.cleanText(searchPhrase)[0]))]
    self.listSearch = {}
    self.k = k
def tokenize(post):
    for currPunct in punctuations:
        post = post.replace(currPunct, "")
    if bool(emoji.get_emoji_regexp().search(post)):
        post = emoji.demojize(post)
    tokens = lemmatize(post)
    # str() of a bytes token gives "b'word/NN'", so split on '/' for the word
    # part, then on the quote to drop the leading "b'"
    tokens = [str(x).split("/")[0].split('\'')[1] for x in tokens]
    tokens = [item for item in tokens
              if item not in stop and item not in add_stop]
    return tokens
def process_article(args):
    # override original method in wikicorpus.py
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
def _phrases_in_raw_text_via_lemmatisation(self, raw_text):
    """
    Builds a list of lemmas from raw text using lemmatization.
    """
    all_lemmas = lemmatize(raw_text,
                           allowed_tags=re.compile('(NN|JJ)'),
                           stopwords=STOPWORDS_UNICODE)
    document_bigrams = self.fetch_document_bigrams(all_lemmas)
    known_bigrams = [bigram for bigram in document_bigrams
                     if bigram in self.top_bigrams]
    return all_lemmas + known_bigrams
def gensimlemm(texts):
    texts_out = []
    for sent in texts:
        doc = " ".join(sent)
        # print(doc)
        if len(doc) > 0:
            # Keep only nouns ('NN') and strip the POS suffix
            lemmatized_out = [wd.decode('utf-8').split('/')[0]
                              for wd in lemmatize(doc)
                              if wd.decode('utf-8').split('/')[1] == 'NN']
            texts_out.append(lemmatized_out)
    return texts_out
def clean_feedback(row):
    tokenizer = RegexTokenizer() | LowercaseFilter() | IntraWordFilter() | StopFilter()
    stemmer = StemFilter()
    combined = row['Feedback']
    lemmList = [word.decode('utf-8').split('/')[0] for word in lemmatize(combined)]
    tokenWords = [token.text for token in tokenizer(combined)]
    stemWords = [stemmer.stemfn(word) for word in tokenWords]
    final = tokenWords + lemmList + stemWords
    # Join by space so it is easy for RegexTokenizer to manage
    return ' '.join(set(final))
def GetNounsFromDefinition(definition=str()):
    nouns_ = []
    lemma_ = lemmatize(definition)
    for word in lemma_:
        word_pos_ = word.split('/')
        # Keep nouns (N*) as well as adverbs (R*) and adjectives (J*)
        if word_pos_[1][0] in ['N', 'R', 'J']:
            nouns_.append(word_pos_[0])
    return nouns_
def __init__(self, text):
    """
    :param text: content of document
    :type text: string
    """
    # d = {<t1, w1>, ... <tm, wm>}
    self.terms_quantity = Counter(lemma for lemma in lemmatize(text)
                                  if lemma[:-3] not in STOPWORDS)
def gensimTest(text):
    print 'gensim'
    start = time()
    lemmas = lemmatize(text)
    for lemma in lemmas:
        lemma = lemma.split('/')
        print lemma[0], lemma[1]
    end = time()
    print 'gensim time:', (end - start)
    print "********************************"
def posNN(text):
    tokens = []
    for word in lemmatize(text):
        st = word.decode("utf-8").split("/")
        # print(st)
        # Keep only nouns and verbs
        if st[1] == 'NN' or st[1] == 'VB':
            tokens.append(st[0])
    stop = open("stop.txt", "r").read().split("\n")
    filtered_tokens = [token for token in tokens if token not in stop]
    return " ".join(filtered_tokens)
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'),
                       no_above=0.5, no_below=1, keep_n=None):
    """
    :rtype: gensim.corpora.dictionary.Dictionary
    :param corpora: mapping from document id to raw text
    :param stopwords: words to filter out
    :param allowed_pos: regex of POS tags to keep when lemmatizing
    :param max_doc: maximum number of documents to process
    :return: dictionary with .corpus, .corpus_id2orig_id and .id2token attached
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []
    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue
        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)
        # convert compound words into one token
        corpus = convert_compound(corpus)
        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus
                  if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)
    print '\n'
    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id
    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()
    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)
    return dictionary
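# Hedged usage sketch for preprocess_corpora (Python 2, to match the snippet;
# clean_text, convert_compound and revdict are project helpers assumed to be
# in scope):
#
# import re
# corpora_by_id = {'msg-001': 'Raw text of the first email ...',
#                  'msg-002': 'Raw text of the second email ...'}
# dictionary = preprocess_corpora(corpora_by_id, stopwords=set(['the', 'a']),
#                                 allowed_pos=re.compile('(NN)'), no_below=2)
# dictionary.corpus then holds the bag-of-words vectors, aligned with
# dictionary.corpus_id2orig_id.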
def english_lemmatizer(text):
    """
    Calls the "pattern" module lemmatizer through utils.
    """
    result = utils.lemmatize(text)
    if ONLY_NOUN_VERBS:
        result = filter(lambda x: x.split('/')[-1] == 'VB' or x.split('/')[-1] == 'NN',
                        result)
    if ONLY_NOUNS:
        result = filter(lambda x: x.split('/')[-1] == 'NN', result)
    if DEBUG:
        print text
        print result
    return result
def process_file_path(file_path):
    with open(file_path, "r") as file:
        # The first line is the article name; drop its trailing '\n'
        article_name = file.readline()[:-1]
        # The remaining lines are the document
        doc = " ".join(file.readlines())
        lemmatized_doc = utils.lemmatize(doc)
        return article_name, lemmatized_doc
def get_summary(news_link="http://english.onlinekhabar.com/will-try-to-endorse-medical-education-bill-on-friday-says-speaker.html"):
    # Get the news content
    news_source = urllib.request.urlopen(news_link).read()
    news_soup = bs.BeautifulSoup(news_source, 'lxml')
    news_content = news_soup.find_all('div', class_='oke-content-wrap clearfix')
    news_portion = news_content[0].find_all('p')
    news_para = [n.text for n in news_portion]
    news_para = ' '.join(news_para)
    news = news_para.split('\n\t')[0]
    # Split into sentences
    news = news.split('\n')
    news = ' '.join(news)
    sentence_tk = sent_tokenize(news)
    print(sentence_tk)
    # Lemmatize each sentence (reduce words to their root form)
    tokenized = []
    i = 1
    for sentence in sentence_tk:
        print(i)
        lemmatized_out = [wd.decode('utf-8').split('/')[0] for wd in lemmatize(sentence)]
        lemmatized_out = ' '.join(lemmatized_out)
        tokenized.append(lemmatized_out)
        i = i + 1
    print(tokenized)
    print('\n\n')
    # Cluster the news sentences
    clustering_data = []
    for token in tokenized:
        # NOTE: infer_vector expects a list of tokens; passing a string makes
        # Doc2Vec treat each character as a token
        vec = model.infer_vector(token)
        clustering_data.append(vec)
    data_length = len(clustering_data)
    n_clusters = int(np.floor(data_length / 3))
    kmeans = KMeans(n_clusters=n_clusters, n_init=1)
    kmeans = kmeans.fit(clustering_data)
    # Pick representative sentences: the one closest to each cluster centre,
    # with clusters ordered by the mean position of their sentences
    avg = []
    for j in range(n_clusters):
        idx = np.where(kmeans.labels_ == j)[0]
        avg.append(np.mean(idx))
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, clustering_data)
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])
    summary = ' '.join([sentence_tk[closest[idx]] for idx in ordering])
    # print(summary + '\n\n')
    # print('Length of original text: ', len(sentence_tk))
    # print('Length of summary: ', len(sent_tokenize(summary)))
    return summary
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result
def process_article(args):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenize(text)
    return result, title, pageid
def process_post(args):
    """Normalize an entry into tokens."""
    content, lemmatize, subject, pageid = args
    text = url_re.sub('', subject + " " + content)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = [token.encode('utf8')
                  for token in utils.tokenize(text, lower=True, errors='ignore')
                  if 2 <= len(token) <= 15 and not token.startswith('_')]
    return result, subject, pageid
def __init__(self, searchPhrase, dbname='TwitterDB', query=None, k=0):
    self.queries = Queries(dbname)
    self.words = [word.split('/')[0]
                  for word in lemmatize(cleanText.removeStopWords(
                      cleanText.cleanText(searchPhrase)[0]))]
    self.idfs = dict()
    and_list = []
    for word in self.words:
        and_list.append({'words.word': word})
    self.query_search = {"$and": and_list}
    if query:
        self.existing = True
        self.query_search.update(query)
    else:
        self.existing = False
    self.k = k
def get_features(self, document):
    # Create a list of tokens from the document
    logger.debug("Lemmatize document.")
    tokens = utils.lemmatize(document)
    # Create a bag-of-words representation of the document from the token list
    logger.debug("Create bag-of-words representation from article.")
    doc_bow = self.dictionary.doc2bow(tokens)
    # Create a tfidf representation from the bag-of-words
    logger.debug("Transform to tfidf.")
    doc_tfidf = self.tfidf_model[doc_bow]
    return doc_tfidf
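# Runnable sketch of the bow -> tfidf chain that get_features relies on; the
# dictionary and tfidf_model here stand in for the attributes on `self`.
from gensim import corpora, models

texts = [['human', 'computer', 'interaction'],
         ['graph', 'minors', 'survey']]
dictionary = corpora.Dictionary(texts)
tfidf_model = models.TfidfModel([dictionary.doc2bow(t) for t in texts])
doc_tfidf = tfidf_model[dictionary.doc2bow(['human', 'graph', 'survey'])]
print(doc_tfidf)  # sparse (token_id, weight) pairs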
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
                    token_max_len=TOKEN_MAX_LEN, lower=True):
    """
    Parse a wikipedia article, returning its content as a list of tokens
    (utf8-encoded strings).

    Set the `tokenizer_func` parameter (defaults to `tokenize`) for languages
    like Japanese or Thai to perform better tokenization.
    The `tokenizer_func` needs to take 4 parameters:
    (text, token_min_len, token_max_len, lower).
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenizer_func(text, token_min_len, token_max_len, lower)
    return result, title, pageid
def get_trans(line, sid, nitems=None, lemma=True, metadata=True,
              sw=stopwords.words("english")):
    # logger.info("get_trans")
    if lemma:
        trans = utils.lemmatize(line, stopwords=sw)
    else:
        trans = utils.tokenize(line.replace(".", ""), lowercase=True)
    trans = " ".join([x.lower() for x in trans])
    # if trunc:
    #     trans = " ".join(trans.split()[:trunc])
    # print "sw:", sw, "TRANS:", trans
    if metadata:
        return trans, (nitems, sid)
    else:
        return trans
def get_texts(self):
    """
    Iterate over the Wikipedia dump and the HN articles, returning text.
    """
    wiki_articles, hn_articles, articles_all = 0, 0, 0
    positions, positions_all = 0, 0
    # ************ Wikipedia ************
    texts = ((text, self.lemmatize)
             for _, text in wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
    pool = multiprocessing.Pool(self.processes)
    # chunkize keeps memory bounded; otherwise imap puts the whole corpus into memory
    for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1):
        for tokens in pool.imap(wikicorpus.process_article, group):
            articles_all += 1
            positions_all += len(tokens)
            if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
                wiki_articles += 1
                positions += len(tokens)
                yield tokens
    pool.terminate()
    print(">>> finished iterating over Wikipedia corpus of %i documents with %i positions"
          " (total %i articles, %i positions before pruning articles shorter than %i words)"
          % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))
    # ************ HN articles ************
    positions_after_wiki = positions
    fnamelist = []
    for g in glob.iglob(self.hn_folder + '/*.txt'):
        fnamelist.append(g)
    for fileno, fname in enumerate(fnamelist):  # TODO parallelize as Wiki
        hn_text = open(fname).read()
        if self.lemmatize:
            result = utils.lemmatize(hn_text)  # text into lemmas here
        else:
            result = tokenize(hn_text)  # text into tokens here
        articles_all += 1
        positions_all += len(result)
        if len(result) > HN_ARTICLE_MIN_WORDS:
            hn_articles += 1
            positions += len(result)
            yield result
    print(">>> finished iterating over HN corpus of %i documents with %i positions"
          % (hn_articles, positions - positions_after_wiki))
    # ************ /HN articles ************
    self.length = wiki_articles + hn_articles  # cache corpus length
def bowCorpus(root_path):
    vocab = corpora.dictionary.Dictionary()
    corpus = []
    filenames = [os.path.join(root_path, f) for f in os.listdir(root_path)]
    print colored(len(filenames), "green"), "files found in", colored(root_path, "green")
    print "Converting each file into bag-of-words:"
    for fname in pbar(filenames):
        with open(fname, "r") as f:
            content = f.read()
        # lemmatize returns strings like 'moderate/VB' or 'listing/NN'
        tokens = utils.lemmatize(content)
        tokens = [x.split("/")[0] for x in tokens]
        bow = vocab.doc2bow(tokens, allow_update=True)
        corpus.append(bow)
    return corpus, vocab
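# Hedged follow-up for bowCorpus above (gensim < 4.0 with pattern installed;
# the directory path and topic count here are hypothetical):
#
# corpus, vocab = bowCorpus("data/listings")
# lda = models.LdaModel(corpus, id2word=vocab, num_topics=10)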
def get_trans(line, sid, nitems=None, lemma=True, metadata=True,
              sw=stopwords.words("english"), tokens_only=False):
    if lemma:
        trans = utils.lemmatize(line, stopwords=sw)
    else:
        trans = utils.tokenize(line.replace(".", ""), lowercase=True)
    if tokens_only:
        trans = [x.lower() for x in trans]
    else:
        try:
            trans = " ".join([x.lower() for x in trans])
        except:
            logger.error("** get_trans **")
            logger.error(repr(line))
            logger.error(repr(trans))
    if metadata:
        return trans, (nitems, sid)
    else:
        return trans
def get_texts(self):
    """
    Files are processed in parallel. See wikicorpus.py by Radim Rehurek.
    """
    logger = logging.getLogger("feature_extractor")
    processed_articles = 0
    for document in self.corpus:
        if processed_articles % 1000 == 0:
            logger.info("Processing article #%d..." % processed_articles)
        processed_articles += 1
        try:
            tokens = utils.lemmatize(document)
            yield tokens
        except Exception as e:
            logger.error("Could not process article: %s" % e)
    logger.info("Processed %d articles." % processed_articles)
def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN,
                    token_max_len=TOKEN_MAX_LEN, lower=True):
    """Parse a wikipedia article, extract all tokens.

    Notes
    -----
    Set the `tokenizer_func` parameter (default is
    :func:`~gensim.corpora.wikicorpus.tokenize`) for languages like Japanese or
    Thai to perform better tokenization.
    The `tokenizer_func` needs to take 4 parameters:
    (text: str, token_min_len: int, token_max_len: int, lower: bool).

    Parameters
    ----------
    args : (str, bool, str, int)
        Article text, lemmatize flag (if True, :func:`~gensim.utils.lemmatize`
        will be used), article title, page identifier.
    tokenizer_func : function
        Function for tokenization (default is
        :func:`~gensim.corpora.wikicorpus.tokenize`). Needs to have interface:
        tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
        If True, convert article text to lower case.

    Returns
    -------
    (list of str, str, int)
        List of tokens from article, title and page id.
    """
    text, lemmatize, title, pageid = args
    text = filter_wiki(text)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = tokenizer_func(text, token_min_len, token_max_len, lower)
    return result, title, pageid
def get_texts(self):
    """
    Files are processed in parallel. See wikicorpus.py by Radim Rehurek.
    """
    logger = logging.getLogger("feature_extractor")
    processed_articles = 0
    for article in Article.objects():
        if processed_articles % 1000 == 0:
            logger.info("Processing article #%d..." % processed_articles)
        processed_articles += 1
        try:
            doc = article.clean_content
            tokens = utils.lemmatize(doc)
            yield tokens
        except Exception as e:
            logger.error("Could not process article %s (%s): %s"
                         % (article.id, type(e), e))
    logger.info("Processed %d articles." % processed_articles)
import pickle, sys
from gensim import utils

# article_to_score = '../data/paulgraham.com-startupideas.html.txt'
# article_to_score = '../data/paulgraham.com-founder.html.txt'
article_to_score = '../data/paulgraham.com-ycombinator.html.txt'
text = open(article_to_score, 'r').read()
LEMMATIZE = utils.HAS_PATTERN
lda = None
if LEMMATIZE:
    f = open('/Users/gabrielsynnaeve/Dropbox/Public/hn_lemmatized.ldamodel', 'r')
    lda = pickle.load(f)
    a = utils.lemmatize(text)
else:
    print >> sys.stderr, "ERROR: install pattern"
    sys.exit(-1)
user = '******'
if len(sys.argv) > 1:
    user = sys.argv[1]
user_params = None
with open(user + '.params') as f:
    user_params = pickle.load(f)
# score \propto P(Like)
# P(Like=true) \propto \sum_{t \in Topics}[P(TopicsArticle)
#     * P(\lambda|t,TopicsArticle) * P(t|Like=true) * P(Like=true)]
score = 0.0
for topicid, proba in lda[lda.id2word.doc2bow(a)]:
best10 = bests[topicid][:10]
beststrl = [(topic[i], ldaobject.id2word[i]) for i in best10]
beststr = " + ".join(["%.3f*%s" % v for v in beststrl])
if LEMMATIZE:
    print "topic #", topicid, " described by word:", topicnames[topicid].split("/")[0]
else:
    print "topic #", topicid, " described by word:", topicnames[topicid]
print beststr

f = None
if LEMMATIZE:
    f = open("hn_lemmatized.ldamodel", "r")
else:
    f = open("hn.ldamodel", "r")
lda = pickle.load(f)
topic_names(lda)
article = open("/Users/gabrielsynnaeve/labs/clojure/hackernews/data/99985.txt", "r").read()
a = None
if LEMMATIZE:
    a = utils.lemmatize(article)
else:
    a = tokenize(article)
print a
for topic, proba in lda[lda.id2word.doc2bow(a)]:
    print lda.show_topic(topic)
    print proba
doc = " ".join(file.readlines()) except Exception as e: logger.error("Could not load document from %s" % options.text) sys.exit(1) #load dictionary, tfidf model, lda model, esa model logger.info("Load dictionary, tfidf model, lda model and esa model with prefix %s" % options.prefix) dictionary = corpora.Dictionary.load(options.prefix + "_wordids.dict") tfidf_model = models.TfidfModel.load(options.prefix + "_tfidf.model") lda_model = models.LdaModel.load(options.prefix + "_lda.model") esa_model = EsaModel.load(options.prefix + "_esa_on_lda.model") #create list of tokens from doc logger.info("Lemmatize document.") tokens = utils.lemmatize(doc) #create bow of doc from token list logger.info("Create bag-of-words representation from document.") doc_bow = dictionary.doc2bow(tokens) #create tfidf representation from bag-of-words logger.info("Transform to tfidf.") doc_tfidf = tfidf_model[doc_bow] #create lda representation from tfidf logger.info("Transform to lda") doc_lda = lda_model[doc_tfidf] #create esa representation from lda logger.info("Transform to esa")
from gensim.utils import lemmatize

# Czech: "Nothing can travel faster than 300 thousand kilometers per second!"
x = lemmatize('Nic nemůže letět rychlostí vyšší, než 300 tisíc kilometrů za sekundu!')
print(x)
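# gensim.utils.lemmatize wraps the English lemmatizer from the `pattern`
# package and was removed in gensim 4.0, so the Czech sentence above will not
# be lemmatized meaningfully. A rough stand-in on modern stacks, using NLTK's
# (also English-only) WordNetLemmatizer instead:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
tokens = 'nothing can travel faster than light'.split()
print([wnl.lemmatize(t) for t in tokens])  # noun lemmatization by default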
import csv
from collections import Counter

from gensim.models import Phrases, Word2Vec
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords

print("Reading input file 'input/audits_with_content.csv'")
with open('input/audits_with_content.csv', 'r') as f:
    reader = csv.reader(f)
    raw_documents = list(reader)

print("Prepare documents")
documents = [doc[2] for doc in raw_documents if doc[2] != '']

sentences = []
bigram = Phrases()
for document in documents:
    raw_text = document.lower()
    tokens = lemmatize(raw_text, stopwords=STOPWORDS)
    sentences.append(tokens)
    bigram.add_vocab([tokens])

# Count bigram collocations (keys containing "_"), skipping stopwords
bigram_counter = Counter()
for key in bigram.vocab.keys():
    if key not in stopwords.words("english"):
        if len(key.split("_")) > 1:
            bigram_counter[key] += bigram.vocab[key]

for key, counts in bigram_counter.most_common(200):
    print '{0: <20} {1}'.format(key.encode("utf-8"), counts)
froms = []
dates = []
for index, document in documents.items():
    count += 1
    if count > max_doc:
        break
    print '\r', count, '/', doc_num,
    # Incorporate title information by repeating the title title_weight times
    text = document['text'] + (' ' + index) * title_weight
    from_name = document['from']
    date = document['date']
    cleaned = clean_text(text)  # delete irrelevant characters
    document = []
    tokens = lemmatize(content=cleaned, allowed_tags=allowed_pos)  # lemmatize
    for token in tokens:
        word, pos = token.split('/')
        document.append(word)
    # convert compound words into one token
    document = convert_compound(document)
    # filter stop words, long words, and non-english words
    document = [w for w in document
                if w not in stop_words and 2 <= len(w) <= 15 and w.islower()]
    new_documents.append(document)
    titles.append(index)
    froms.append(from_name)
    dates.append(date)