def stem_tokenize(doc, deacc=True, lowercase=True, errors="strict", stemmer=None):
    """Split `doc` into words and stem each word if a stemmer is given."""
    if stemmer is None:
        for token in tokenize(doc, lowercase=lowercase, deacc=deacc, errors=errors):
            yield token
    else:
        for token in tokenize(doc, lowercase=lowercase, deacc=deacc, errors=errors):
            yield stemmer.stemWord(token)
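# A minimal usage sketch for the generator above, not part of the original
# snippet: it assumes `tokenize` is gensim.utils.tokenize and that `stemmer`
# follows the PyStemmer interface (an object exposing stemWord()).
from gensim.utils import tokenize
import Stemmer  # PyStemmer

stemmer = Stemmer.Stemmer('english')
print(list(stem_tokenize("The cats were running quickly", stemmer=stemmer)))
# -> ['the', 'cat', 'were', 'run', 'quick']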
def convert(self, text):
    from gensim.utils import tokenize
    from numpy import asarray
    if isinstance(text, str):
        docs = [tokenize(text, to_lower=True, deacc=True)]
    else:
        docs = [tokenize(t, to_lower=True, deacc=True) for t in text]
    return [asarray([self(t) for t in doc], dtype='int32') for doc in docs]
def tokenize(self, content, BytesOrNot=False):
    """
    Tokenize a piece of text. Return a list of tokens as utf8 bytestrings.
    Ignore words shorter than 2 or longer than 15 characters (not bytes!).
    """
    # https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/corpora/wikicorpus.py#L166
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    if BytesOrNot:
        # return a list of utf8 bytestrings
        return [token.encode('utf8')
                for token in utils.tokenize(content, lower=True, errors='ignore')
                if 2 <= len(token) <= 15 and not token.startswith('_')]
    else:
        # return a list of strings
        return list(utils.tokenize(content, lower=True, errors='ignore'))
def characters(self, text):
    # for text, we only care about tokens directly within the <p> tag
    if self.path[-1] == 'p':
        tokens = [token.encode('utf8')
                  for token in utils.tokenize(text, errors='ignore')
                  if not token.isdigit()]
        self.tokens.extend(tokens)
def tokenize(self, document):
    """
    Break text into sentences and each sentence into a list of single words.
    Ignore any token that falls into the stopwords set.
    """
    # use sentence tokenizer sent_tokenize from nltk package
    sentences = sent_tokenize(utils.to_unicode(document.lower()))
    # create stemmer of class SnowballStemmer
    stemmer = SnowballStemmer("english")
    for sentence in sentences:
        words = [word for word in utils.tokenize(self.cleanse_text(sentence))]
        if self.remove_stopwords:
            words = [word for word in words if word not in self.en_stopwords]
        if self.stemming:
            words = [stemmer.stem(t) for t in words]
        yield words
def iter_documents(top_directory):
    numFound = 0
    for root, dirs, files in os.walk(top_directory):
        for dir1 in filter(lambda newspaper: newspaper != "TheCharlestonMercury-incomplete"
                           and newspaper != "VincennesCourant", dirs):
            # for dir1 in filter(lambda newspaper: newspaper == "TheCharlestonMercury-incomplete"
            #                    or newspaper == "VincennesCourant", dirs):
            for root2, dirs2, files2 in os.walk(top_directory + "/" + dir1):
                for dir2 in dirs2:
                    for root3, dirs3, files3 in os.walk(top_directory + "/" + dir1 + "/" + dir2):
                        for file1 in filter(lambda filee: filee.endswith('.txt'), files3):
                            document = open(os.path.join(root, dir1, dir2, file1)).read()
                            newline = str(numFound) + "," + dir1 + "/" + dir2 + "/" + file1 + ","
                            mdfile = open("accessible/" + dir1 + "/" + dir2 + "/"
                                          + file1[:-3] + "md", "r")
                            lines = mdfile.readlines()
                            for line in lines:
                                newline += line.split(", ")[1].strip("\n") + ","
                            aFile.write(newline[:-1] + "\n")
                            stoplist = set('for a of the and to in'.split())
                            resultwords = [word for word in document.split()
                                           if word.lower() not in stoplist]
                            result = ' '.join(resultwords)
                            # note: iterates over the *characters* of `result`,
                            # replacing every non-letter with a space
                            cleanedwords = [re.subn("[^a-zA-Z]+", ' ', word)[0]
                                            for word in result if '-' not in word]
                            resultfinal = ''.join(cleanedwords)
                            words = [word.strip() for word in resultfinal.split()]
                            final = ' '.join(words)
                            numFound += 1
                            yield utils.tokenize(resultfinal, lower=True)
def build_word_vector(n=0, mincount=1):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    corpus_name = 'WIKI_' + sbc
    sentences = []
    current_term = ""
    with io.open(corpus_name, 'r', encoding='utf8') as fin:
        for line in fin:
            if '\t' in line:
                current_term = line.strip().split('\t')[1]
            if line.strip().endswith('.'):
                if current_term in line:
                    if ' is a ' in line:
                        line = line.replace(' is a ', ' is_a ')
                    if ' is an ' in line:
                        line = line.replace(' is an ', ' is_a ')
                    # Single-tokenize terms.
                    depunct_term = "".join(['_' if ch in string.punctuation or ch == ' ' else ch
                                            for ch in current_term])
                    line = line.replace(current_term, depunct_term).lower()
                sentences.append(list(tokenize(line)))
    bigram_transformer = Phrases(sentences)
    model = Word2Vec(bigram_transformer[sentences], size=100, window=5,
                     min_count=mincount, workers=3, iter=100)
    model.save(corpus_name + '.100epochs.phrasal.singletok.min' + str(mincount) + '.deep')
def tokenize(s, tokenizer):
    """
    Tokenizes a string. Returns a different list of tokens depending on which
    tokenizer is used.

    :param s: string to be tokenized
    :type s: str
    :param tokenizer: identifies tokenizer to use
    :type tokenizer: str
    :return: list of tokens
    :rtype: []
    """
    # compare strings with `==`, not `is` (identity is not guaranteed for literals)
    tokens = (twokenize.tokenize(s) if tokenizer == 'twokenize'
              else (utils.tokenize(s, lower=True) if tokenizer == 'gensim'
                    else TweetTokenizer(preserve_case=False).tokenize(s)))
    # list of symbols that can end sentences. twokenize has found these to not
    # be attached to another token (safe to remove).
    punct = r'.,!!!!????!:;'
    # NLTK english stopwords
    stoplist = stopwords.words('english')
    result = [tok.lower() for tok in tokens if tok not in punct]
    result = [tok for tok in result if tok not in stoplist]
    return result
def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """Tokenize a piece of text from wikipedia.

    Set `token_min_len`, `token_max_len` as character length (not bytes!)
    thresholds for individual tokens.

    Parameters
    ----------
    content : str
        String without markup (see :func:`~gensim.corpora.wikicorpus.filter_wiki`).
    token_min_len : int
        Minimal token length.
    token_max_len : int
        Maximal token length.
    lower : bool
        If True - convert `content` to lower case.

    Returns
    -------
    list of str
        List of tokens from `content`.

    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [
        utils.to_unicode(token)
        for token in utils.tokenize(content, lower=lower, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
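# A quick demonstration of the tokenizer above, assuming gensim's defaults
# TOKEN_MIN_LEN = 2 and TOKEN_MAX_LEN = 15 and `from gensim import utils`.
TOKEN_MIN_LEN, TOKEN_MAX_LEN = 2, 15
print(tokenize("Anarchism is a political philosophy"))
# -> ['anarchism', 'is', 'political', 'philosophy']  ('a' is shorter than 2 chars)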
def tokenize_by_word(text):
    """Tokenize input text. Before tokenizing transforms text to lower case and
    removes accentuation and acronyms set :const:`~gensim.summarization.textcleaner.AB_ACRONYM_LETTERS`.

    Parameters
    ----------
    text : str
        Given text.

    Returns
    -------
    generator
        Generator that yields sequence words of the given text.

    Example
    -------
    >>> from gensim.summarization.textcleaner import tokenize_by_word
    >>> g = tokenize_by_word('Veni. Vedi. Vici.')
    >>> print(next(g))
    veni
    >>> print(next(g))
    vedi
    >>> print(next(g))
    vici

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    return tokenize(text_without_acronyms, to_lower=True, deacc=True)
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    >>> from gensim.summarization.textcleaner import clean_text_by_word
    >>> clean_text_by_word("God helps those who help themselves")
    {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
    'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
    'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "") for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
def get_texts(self):
    with utils.smart_open(self.datafile) as inputfile:
        for line in inputfile:
            for f in self.preprocess:
                line = f(line)
            text = list(utils.tokenize(line, deacc=True, lowercase=True))
            yield text
def _tokenize_text_file(fname):
    with open(fname, "r") as f:
        doc = f.read()
        for word in utils.tokenize(doc, lowercase=True):
            if word not in STOPWORDS_SET:
                yield word
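# A tiny driver for the generator above; the file contents and the stopword
# set are illustrative assumptions, not taken from the original.
from gensim import utils

STOPWORDS_SET = {'the', 'of'}
with open('sample.txt', 'w') as f:
    f.write("The history of tokenization")
print(list(_tokenize_text_file('sample.txt')))
# -> ['history', 'tokenization']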
def get_similarity_list(new_doc):
    new_doc = utils.tokenize(new_doc)
    new_doc_bow = pubmed_corpus_lsi.corpus.corpus.dictionary.doc2bow(new_doc)
    new_doc_tfidf = pubmed_tfidf[new_doc_bow]
    new_doc_lsi = pubmed_lsi[new_doc_tfidf]
    new_doc_sims = pubmed_sim[new_doc_lsi]
    return new_doc_sims
def get_texts(self):
    for filename in self.input:
        root = ET.fromstring(open(filename).read())
        lang = root.attrib['lang'].lower()
        genre = root.attrib['type']
        tree = ET.ElementTree(root)
        yield tokenize(clean(open(filename).read(), lang, genre, tree))
def tokenizer2(d):
    """
    Tokenizer that returns a dictionary of the stemmed tokens, with the list
    of words that were transformed into that token.

    :param d: the document (text) to be tokenized
    :type d: unicode
    :rtype dict of tuple
    """
    def myreducer(d, t):
        """
        Receives a dictionary and a tuple of stem and word list.
        Adds {stem: [currList] + word list} to it.

        :param d: the dictionary
        :type d: dict
        :param t: the tuple, which should be (stem, word list)
        :type t: tuple
        :rtype dict
        """
        try:
            d[t[0]] += [t[1]]
        except KeyError:
            d[t[0]] = [t[1]]
        except:
            print(t)
            raise
        return d

    dic = {}
    if len(d):
        lW = [re.sub('[ _]+', ' ', w).strip() for w in utils.tokenize(d)
              if len(re.sub('[ _]+', ' ', w).strip())]
        lS = [mystem(w) for w in lW]
        lS, lW = processBiGrams(lS, lW)
        l = [(s, w) for s, w in zip(lS, lW)]
        if len(l) and usesVocab([w[0] for w in l]):
            l2 = removeVocab(l)
            if len(l2):
                dic = reduce(myreducer, l2, {})  # Python 3: from functools import reduce
    return dic
def tokenize(self, content):
    """
    Tokenization following the Wikipedia corpus conventions: any token
    shorter than 2 or longer than 15 characters is ignored, and the token
    must not start with '_'.
    """
    return [token.encode('utf8')
            for token in utils.tokenize(content, lower=True, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')]
def parse_paragraphs(self):
    for tag in self.paragraph_parse_tags:
        for element in self.content.find_all(tag):
            text = element.get_text(' ', strip=True).encode('ascii', "ignore")
            # TODO: remove hyperlinks
            text = " ".join(tokenize(text, lowercase=True))
            if text != '':
                self.paragraphs += [text]
def cleanText(text):
    plain_text = text.lower().replace("\n", " ").replace('ieee transactions on magnetics', '')
    plain_text = plain_text.replace('ieee', '').replace('abstract', '')
    result = list()
    for word in tokenize(plain_text):
        if word not in stoplist and re.search("[a-z]", word) and len(word) > 2:
            result.append(word.encode("utf8"))
    return result
def process_review(review):
    tokens = [token.encode('utf8')
              for token in utils.tokenize(review, lower=True, errors='ignore')
              if 2 <= len(token) <= 15]
    tokens = [norm(token) for token in tokens if norm(token)]
    tokens = [token for token in tokens if token not in stwords]
    # stem each token once instead of twice
    stemmed = (stemmer.stem(token) for token in tokens)
    tokens = [token for token in stemmed if token]
    return tokens
def getTokensFromEntry(entry):
    text = entry.get("review/text")
    if text is None:
        print("Empty Document")
        return ["None"]
    tokens = utils.tokenize(text, lower=True, errors='ignore')
    return tokens
def get_texts(self):
    length = 0
    self.input.seek(0)
    for line in self.input:
        length += 1
        line = re.sub(r"</?s>", "", line)
        line = line.rstrip("\n")
        yield utils.tokenize(line)
    self.length = length
def get_texts(self):
    """
    Iterate over the collection, yielding one document at a time. A document
    is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

    Override this function to match your input (parse input files, do any
    text preprocessing, lowercasing, tokenizing etc.). There will be no
    further preprocessing of the words coming out of this function.
    """
    # Instead of raising NotImplementedError, let's provide a sample implementation:
    # assume documents are lines in a single file (one document per line).
    # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
    with self.getstream() as lines:
        for lineno, line in enumerate(lines):
            if self.metadata:
                yield utils.tokenize(line, lowercase=True), (lineno,)
            else:
                yield utils.tokenize(line, lowercase=True)
def get_texts(self):
    for path in self.filepaths:
        with codecs.open(path, encoding='utf8') as f:
            raw_text = f.read()
        raw_text = raw_text.lower()
        for filt in self.preprocess:
            raw_text = filt(raw_text)
        text = list(utils.tokenize(raw_text, deacc=True, lowercase=True))
        yield text
def _preprocess_text(text, stem=False):
    """
    Performs common atomic operations on one text chunk - tokenization,
    normalization.

    :param text:
    :return:
    """
    # note: under Python 3, filter/map return lazy iterators
    words = filter(lambda x: x not in STOPS, map(lambda x: x.lower(), tokenize(text)))
    if stem:
        porter = PorterStemmer()
        words = map(porter.stem, words)
    return words
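# A sketch of calling the helper above with a toy stopword set (STOPS is an
# assumption for the demo; the real set is defined elsewhere). The result is
# materialized with list() since filter/map are lazy in Python 3.
STOPS = {'the', 'a'}
print(list(_preprocess_text("The running foxes", stem=True)))
# -> ['run', 'fox']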
def add(self, text):
    if text is None:
        return
    from gensim.utils import tokenize
    if isinstance(text, str):
        docs = [tokenize(text, to_lower=True)]
    else:
        docs = [tokenize(t, to_lower=True) for t in text]
    for doc in docs:
        for t in doc:
            if t in self._token_counts:
                self._token_counts[t] += 1
            else:
                self._token_counts[t] = 1
                self.id2token.append(t)
                self.token2id[t] = self._id
                self._id += 1
def get_texts(self):
    for index in self.input[0]:
        root = ET.fromstring(open(self.input[1][index]).read())
        lang = root.attrib['lang'].lower()
        genre = root.attrib['type']
        tree = ET.ElementTree(root)
        # clean and tokenize one document per input file
        text = clean(open(self.input[1][index]).read(), lang, genre, tree)
        yield tokenize(text)
def get_texts(self, raw=False):
    """Yield raw text or tokenized text."""
    for j in self.get_json():
        text = j["text"]
        if raw:
            yield text
        else:
            yield utils.tokenize(text, deacc=True, lowercase=True)
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is
    assumed to be mark-up free (see `filterWiki()`).

    Return tokens as utf8 bytestrings.
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [token.encode('utf8')
            for token in utils.tokenize(content, lower=True, errors='ignore')
            if len(token) <= 15 and not token.startswith('_')]
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is
    assumed to be mark-up free (see `filter_wiki()`).

    Return list of tokens as utf8 bytestrings. Ignore words shorter than 2 or
    longer than 15 characters (not bytes!).
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [token.encode('utf8')
            for token in utils.tokenize(content, lower=False, errors='ignore')
            if 2 <= len(token) <= 15 and not token.startswith('_')]
def get_texts(self):
    stoplist = set('for a of the and to in'.split())  # add http?
    for fname in os.listdir(self.dirname):
        W = []
        for line in open(os.path.join(self.dirname, fname)):
            line = re.sub(' "source":(.[^,]+)",', '', line)  # remove json.loads corrupters
            w = json.loads(line)
            # tokenize and remove common words
            w = utils.tokenize(w['text'], lowercase=True)
            w = [word for word in w if word not in stoplist]
            W.extend(w)
        yield W
def tokenize(content):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is
    assumed to be mark-up free (see `filter_wiki()`).

    Return list of tokens as utf8 bytestrings. Ignore words shorter than 2 or
    longer than 15 characters (not bytes!).
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [
        token.encode('utf8')
        for token in utils.tokenize(content, lower=True, errors='ignore')
        if 2 <= len(token) <= 15 and not token.startswith('_')
    ]
def tokenize(content, token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """
    Tokenize a piece of text from wikipedia. The input string `content` is
    assumed to be mark-up free (see `filter_wiki()`).

    Set `token_min_len`, `token_max_len` as character length (not bytes!)
    thresholds for individual tokens.

    Return list of tokens as unicode strings.
    """
    # TODO maybe ignore tokens with non-latin characters? (no chinese, arabic, russian etc.)
    return [
        utils.to_unicode(token)
        for token in utils.tokenize(content, lower=lower, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
def get_texts(self):
    """
    Iterate over the collection, yielding one document at a time. A document
    is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.

    Override this function to match your input (parse input files, do any
    text preprocessing, lowercasing, tokenizing etc.). There will be no
    further preprocessing of the words coming out of this function.
    """
    # Instead of raising NotImplementedError, let's provide a sample implementation:
    # assume documents are lines in a single file (one document per line).
    # Yield each document as a list of lowercase tokens, via `utils.tokenize`.
    for lineno, line in enumerate(getstream(self.input)):
        yield utils.tokenize(line, lowercase=True)
def train_embedding(self):
    print("\nTraining embedding\n")
    encoder = {self.pad_word: 0, self.unk_word: 1}
    probs = {self.pad_word: 1, self.unk_word: 1}
    data = tfds.load('imdb_reviews/plain_text', split='unsupervised', data_dir=DATA_DIR)
    total_words = 2  # pad and unknown
    entry_count = 2
    max_len = 0
    for text in tfds.as_numpy(data):
        # drop the first tokens introduced by str()-ing the raw bytes
        tokens = list(tokenize(str(text), lowercase=True))[3:]
        for idx, word in enumerate(tokens):
            total_words += 1
            if word not in encoder:
                encoder[word] = entry_count  # assign the id first, then advance the counter
                probs[word] = 1
                entry_count += 1
            else:
                probs[word] += 1
            if idx > max_len:
                max_len = idx
    print(f"The vocabulary size is {entry_count}")
    print(f"The maximum length of a review is {max_len + 1}")
    probs = {k: v / total_words for k, v in probs.items()}
    probs[self.pad_word] = 1 - np.finfo(np.float32).eps
    probs[self.unk_word] = np.finfo(np.float32).eps
    glove2word2vec(self.glove_input_file, self.model_file)
    model = KeyedVectors.load_word2vec_format(self.model_file, binary=False)
    print("Creating matrix")
    skipped_words = 0
    emb_matrix = np.zeros((entry_count, self.vec_len), dtype=np.float32)
    # index rows by each word's id, so matrix rows line up with the encoder
    for word, idx in encoder.items():
        try:
            emb_matrix[idx] = model[word]
        except KeyError:
            skipped_words += 1
    print(f"Skipped {skipped_words} out of {entry_count}")
    np.save(open(self.matrix_file, 'wb'), emb_matrix)
    pickle.dump(encoder, open(self.encoder_file, 'wb'), protocol=0)
    pickle.dump(probs, open(self.probs_file, 'wb'), protocol=0)
    return encoder, probs, emb_matrix
def _clean_text(self, the_tweet_text):
    cleaned_text = p.clean(the_tweet_text).lower().replace("’", "'")
    words = cleaned_text.split()
    reformed = [CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words]
    cleaned_text = " ".join(reformed)
    cleaned_text = cleaned_text.translate(str.maketrans('', '', string.punctuation))
    cleaned_text = self._removeNonAscii(cleaned_text)
    tokenized_text = list(tokenize(cleaned_text))
    tokenized_text = self._remove_stops(tokenized_text)
    return tokenized_text
def process_post(args):
    """Normalize an entry into tokens."""
    content, lemmatize, subject, pageid = args
    text = url_re.sub('', subject + " " + content)
    if lemmatize:
        result = utils.lemmatize(text)
    else:
        result = [token.encode('utf8')
                  for token in utils.tokenize(text, lower=True, errors='ignore')
                  if 2 <= len(token) <= 15 and not token.startswith('_')]
    return result, subject, pageid
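# A hypothetical single-process invocation of the worker above (it is shaped
# for multiprocessing.Pool.map, taking one args tuple). `url_re` is assumed
# to be a compiled URL regex; it is not part of the original snippet.
import re
url_re = re.compile(r'https?://\S+')
args = ("see http://example.com for details", False, "gensim digest", 42)
tokens, subject, pageid = process_post(args)
# tokens is a list of utf8 bytestrings, e.g. [b'gensim', b'digest', b'see', ...]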
def fasttext_model_train(data, from_scratch):
    # Preprocessing like stopword removal @TODO
    ge_sentences = [list(tokenize(s)) for s in data['text'].to_list()]
    if from_scratch:
        model = FastText(bucket=1000000, window=3, min_count=1, size=300)
        model.build_vocab(sentences=ge_sentences)
        model.train(sentences=ge_sentences, total_examples=len(ge_sentences), epochs=10)
    else:
        print("loading pretrained fastText vectors")
        model = FastText.load_fasttext_format('content/cc.en.300')
        model.build_vocab(ge_sentences, update=True)
        # model.train(sentences=ge_sentences, total_examples=len(ge_sentences), epochs=5)
    return model
def __iter__(self):
    if self._corpus_fpath.endswith(".gz"):
        # gzip.open's third positional argument is compresslevel, not encoding;
        # open in text mode with an explicit encoding instead
        corpus = gzip.open(self._corpus_fpath, "rt", encoding="utf-8")
    else:
        corpus = codecs.open(self._corpus_fpath, "r", "utf-8")
    for line in corpus:
        yield list(tokenize(line, lowercase=False, deacc=False, errors='strict'))
def get_texts(self):
    i = 0
    for fn in self.input:
        if i > 100:
            break
        i += 1
        text = open(fn, 'r').read()
        # alternative: lemmatize each token
        # yield [CorpusNTA.lmtzr.lemmatize(word)
        #        for word in utils.tokenize(text, deacc=True, lower=True)
        #        if word not in CorpusNTA.stoplist]
        yield [word for word in utils.tokenize(text, deacc=True, lower=True)
               if word not in CorpusNTA.stoplist]
        if i % 100 == 0:
            print("%d documents processed" % i)
def create_bow_corpus(textFile, dictionary, outputDir):
    """
    Creates a Gensim bag-of-words corpus from a Gensim dictionary and saves it.

    textFile (string): Location of the specified text file used in dictionary generation
    dictionary (Gensim dictionary object): Dictionary object used in corpus generation
    outputDir (string): Location to save the corpus
    """
    outputFile = outputDir + 'DBLP_Corpus.mm'
    with open(textFile, encoding='utf-8') as iFile:
        corpus = [dictionary.doc2bow(tokenize(line), allow_update=True) for line in iFile]
    corpora.MmCorpus.serialize(outputFile, corpus)
    print('Corpus created and stored at: ' + outputFile)
    return corpus
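# A minimal, hypothetical driver for the function above; the input file name
# and the way the Dictionary is built are assumptions, not from the original.
from gensim import corpora
from gensim.utils import tokenize

with open('dblp_titles.txt', encoding='utf-8') as f:
    dictionary = corpora.Dictionary(list(tokenize(line)) for line in f)
corpus = create_bow_corpus('dblp_titles.txt', dictionary, 'output/')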
def get_text_content(root):
    path = './body/body.content/block[@class="full_text"]'
    full_text = root.find(path)
    if full_text is None:
        return (None, 0)
    text = ''.join(full_text.itertext()).strip()
    # Ditch lead paragraph
    lines = text.split("\n")
    if lines[0].startswith('LEAD:'):
        lines = lines[1:]
    text = ' '.join(lines)
    # with open("corpora/scratch.txt", "a") as f:
    #     f.write(text)
    #     f.write("\n---\n")
    tokens = [t.lower() for t in tokenize(text)]
    wc = len(tokens)
    return (' '.join(tokens), wc)
def body_topic(dataframe):
    text_body = dataframe['body'].values
    text_body = [remove_stopwords(body) for body in text_body]
    # deacc/lowercase take booleans, not strings
    text_body = [tokenize(body, deacc=True, lowercase=True) for body in text_body]
    text_body = [[snow.stem(token) for token in word_list] for word_list in text_body]
    dataframe['tokens'] = [list(gen) for gen in text_body]
    dataframe['corpus'] = [dictionary.doc2bow(doc) for doc in dataframe['tokens']]
    dataframe['predicted_topic'] = [
        probs_to_topic(topic_probs)
        for topic_probs in topic_model.get_document_topics(dataframe['corpus'])
    ]
    return dataframe
def preprocess(content):
    """
    params -: raw text scraped from website
    return -: list of words after:
        1) tokenization
        2) removing stopwords and some insignificant words
        3) converting to lowercase
        4) lemmatizing
        5) removing common web terms
    """
    content = tokenize(content, deacc=True)
    content = list(filter(is_significant, content))
    content = [token.lower() for token in content]
    MIN_WORDS = 30  # minimum words needed to decide whether site is english or not
    if len(content) > MIN_WORDS and not is_english(content):
        return ['invalidcontentfound']  # signal for non-english site
    content = [lemmatize(token) for token in content
               if token not in STOPWORDS and token in dictionary]
    content = [token for token in content if token not in AVOID]
    return content
def simple_preprocess(doc: str, lower: bool = False, deacc: bool = False,
                      min_len: int = 2, max_len: int = 15) -> List[str]:
    r"""
    Gensim's simple_preprocess with an added 'lower' param to indicate whether
    or not to lowercase all the tokens in the text.

    For more information see: https://radimrehurek.com/gensim/utils.html
    """
    # pass the `lower` argument through instead of hard-coding False
    tokens = [
        token for token in tokenize(doc, lower=lower, deacc=deacc, errors='ignore')
        if min_len <= len(token) <= max_len and not token.startswith('_')
    ]
    return tokens
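# Usage sketch for the wrapper above (assumes `from typing import List` and
# `from gensim.utils import tokenize` are in scope, as the signature implies).
print(simple_preprocess("Ceci est une Phrase accentuée", lower=True, deacc=True))
# -> ['ceci', 'est', 'une', 'phrase', 'accentuee']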
def gen_vocab(tweets):
    vocab, reverse_vocab = {}, {}
    vocab_index = 1
    for tweet in tweets:
        text = tokenize(tweet.lower())
        text = ' '.join([c for c in text if c not in punctuation])
        words = text.split()
        words = [word for word in words if word not in STOPWORDS]
        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word  # generate reverse vocab as well
                vocab_index += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'
    return vocab
def lemmatize(content):
    """
    Use the English lemmatizer from `pattern` to extract tokens in their base
    form=lemma, e.g. "are, is, being" -> "be" etc.

    This is a smarter version of stemming, taking word context into account.

    Only considers nouns, verbs, adjectives and adverbs by default (=all other
    lemmas are discarded).
    """
    content = u' '.join(utils.tokenize(content, lower=True, errors='ignore'))
    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if 2 <= len(lemma) <= 15 and not lemma.startswith('_'):
                if utils.ALLOWED_TAGS.match(tag):
                    result.append(lemma.encode('utf8'))
    return result
def word2vec_classifier(dataset):
    documents = []
    for line in dataset:
        # Wrapper method for tokenizing
        tokens = tokenize(line[0], lower=True)
        sentence = LabeledSentence(tokens, line[1])
        documents.append(sentence)
    log.info("Doc2Vec %d lines" % (len(documents)))

    # Model parameters
    num_features = 100
    min_word_count = 1
    num_workers = 8
    context = 2
    downsampling = 1e-3

    d2v_model = Doc2Vec(min_count=min_word_count, window=context, size=num_features,
                        sample=downsampling, workers=num_workers)

    log.info("Training doc vectors")
    train_set, test_set = train_test_split(documents, train_size=0.7, test_size=0.3)
    train_vec = getAvgFeatureVecs(train_set, d2v_model, num_features)
    test_vec = getAvgFeatureVecs(test_set, d2v_model, num_features)
    train_vec = Imputer().fit_transform(train_vec)
    test_vec = Imputer().fit_transform(test_vec)

    # train model and predict with LinearSVC
    model = LinearSVC()
    train_labels = [doc.tags for doc in train_set]  # one label per training document
    classifier_fitted = OneVsRestClassifier(model).fit(train_vec, train_labels)
    result = classifier_fitted.predict(test_vec)

    # output result to csv
    result.tofile("./d2v_linsvc.csv", sep='\t')

    # store the model to mmap-able files
    joblib.dump(model, 'model/%s.pkl' % 'd2v_linsvc')

    # evaluation
    label_score = classifier_fitted.decision_function(test_vec)
    binarise_result = label_binarize(result, classes=class_list)
    binarise_labels = label_binarize(class_list, classes=class_list)

    # generate_eval_metrics(binarise_result, 'w2v_linsvc', binarise_labels)
    generate_report(binarise_result, 'w2v_linsvc', binarise_labels)
def prepare_corpus(dirname, text_cutoff=1000000):
    underscore = re.compile(r'\_')
    authors, titles, texts = [], [], []
    for filename in sorted(glob.glob(dirname + "/*")):
        if '_' in filename:
            author, title = underscore.split(
                os.path.split(filename)[-1].replace(".txt", ""), maxsplit=1)
        else:
            author, title = next(DUMMY_AUTHORS), os.path.basename(filename).replace(".txt", "")
        authors.append(author)
        titles.append(title)
        with open(filename) as infile:
            texts.append(list(islice(
                tokenize(infile.read(), lowercase=True, deacc=True), 0, text_cutoff)))
    return Dataset(texts, titles, authors)
def clean_text_by_word(text, deacc=True):
    """
    Tokenizes a given text into words, applying filters and lemmatizing them.
    Returns a dict of word -> syntacticUnit.
    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "")
                      for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tagger.tag(original_words)  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return dict((unit.text, unit) for unit in units)
def tokenize_tr(content, token_min_len=2, token_max_len=50, lower=True):
    """Tokenize words in the corpus."""
    if lower:
        # Turkish casing: 'I' must map to dotless 'ı' and 'İ' to dotted 'i',
        # so a custom translation map is used instead of str.lower()
        lower_map = {
            ord(u'A'): u'a', ord(u'B'): u'b', ord(u'C'): u'c', ord(u'Ç'): u'ç',
            ord(u'D'): u'd', ord(u'E'): u'e', ord(u'F'): u'f', ord(u'G'): u'g',
            ord(u'Ğ'): u'ğ', ord(u'H'): u'h', ord(u'I'): u'ı', ord(u'İ'): u'i',
            ord(u'J'): u'j', ord(u'K'): u'k', ord(u'L'): u'l', ord(u'M'): u'm',
            ord(u'N'): u'n', ord(u'O'): u'o', ord(u'Ö'): u'ö', ord(u'P'): u'p',
            ord(u'R'): u'r', ord(u'S'): u's', ord(u'Ş'): u'ş', ord(u'T'): u't',
            ord(u'U'): u'u', ord(u'Ü'): u'ü', ord(u'V'): u'v', ord(u'Y'): u'y',
            ord(u'Z'): u'z',
        }
        content = content.translate(lower_map)
    return [
        utils.to_unicode(token)
        for token in utils.tokenize(content, lower=False, errors='ignore')
        if token_min_len <= len(token) <= token_max_len and not token.startswith('_')
    ]
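# Why the explicit map above matters: str.lower() would map 'I' to 'i', but
# Turkish pairs dotless 'I'/'ı' and dotted 'İ'/'i'. A quick check:
print(tokenize_tr(u'ISPARTA İstanbul'))
# -> ['ısparta', 'istanbul']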
def get_paper_words(author_papers):
    paper_character = []
    for i, paper in enumerate(author_papers):
        title = [word.lower() for word in tokenize(paper['title'])]
        abstract = []
        keywords = []
        text = []
        # if 'abstract' in paper.keys() and paper['abstract'] is not None:
        #     abstract = [word.lower() for word in tokenize(paper['abstract'])]
        if 'keywords' in paper.keys() and paper['keywords'] is not None:
            keywords = [word.lower() for word in paper['keywords']]
        text = title + abstract + keywords  # merge title, keywords and abstract
        text = [word for word in text
                if (word not in my_stopwords) and (word not in stopwords.words('english'))]
        paper_character.append(text)
    return paper_character
def preprocess_text(document):
    """
    Performs advanced preprocessing on a string and returns a lemmatized list
    of tokens.

    :param document: Document string to be preprocessed
    :return: List of preprocessed tokens
    """
    stop_words = nltk.corpus.stopwords.words('english')
    stop_words.extend(CUSTOM_STOP_WORDS)
    en_stop = set(stop_words)
    tokens = utils.tokenize(document, lowercase=True, deacc=True)
    tokens = [str(token) for token in tokens
              if token not in en_stop and not token.startswith('_')]
    tokens = [get_lemma2(token) for token in tokens]
    return tokens
def __iter__(self):
    for directory in CowReader.dirs:
        with codecs.open(os.path.join(CowReader.root, directory, directory + ".xml"),
                         encoding='utf-8') as infile:
            sentence = []
            for line in infile:
                if line.startswith('<s'):
                    continue
                elif line.startswith('</s>'):
                    yield sentence
                    sentence = []
                else:
                    word, pos, lemma = line.strip().split('\t')
                    if pos not in ('$.', 'punc'):
                        sentence.append(word.lower())
    with codecs.open(CowReader.vvb, encoding='utf-8') as vvb:
        for sentence in vvb:
            yield list(tokenize(sentence, lowercase=True))
def prepare_text(self, plain_text):
    tokens = list(tokenize(plain_text))
    tokens = [x for x in tokens if x.lower() not in STOPWORDS]
    plain_text = " ".join(tokens)
    # Phrases expects an iterable of sentences (each a list of tokens)
    bigram_mdl = Phrases([tokens], min_count=1, threshold=2)
    custom_filters = [strip_punctuation, strip_numeric]
    tokens = preprocess_string(plain_text, custom_filters)
    tokens = [t for t in tokens if len(t) > 2]
    bigrams = bigram_mdl[tokens]
    words = list(bigrams)
    words = [re.sub('_', '-', word) for word in words]
    vecs = [self.word2vec[word] if word in self.word2vec.keys()
            else np.zeros(shape=(1, 20)) for word in words]
    # return list of arrays, each array is vector of a single word
    return vecs
def get_texts(self):
    stoplist = set('for a of the and to in'.split())  # add http?
    for fname in os.listdir(self.dirname):
        W = []
        print(os.path.join(self.dirname, fname))
        for line in io.open(os.path.join(self.dirname, fname), 'r', encoding='windows-1252'):
            line = re.sub(' "source":(.[^,]+)",', '', line)  # remove json.loads corrupters
            line = re.sub("(?<=[^a-z])(')(?=.)|(?<=.)(')(?=[^a-z])", '"', line)
            w = json.loads(line)
            # tokenize and remove common words
            w = utils.tokenize(w['text'], lowercase=True)
            w = [word for word in w if word not in stoplist]
            W.extend(w)
        yield W
def predict(text):
    x = np.zeros((1, timesteps, 300), dtype=np.float32)
    tokens = tokenize(text)
    mj = 0
    for w in tokens:
        if mj < timesteps:
            try:
                x[0][mj] = w2v.word_vec(w)
                mj += 1
            except KeyError:  # out-of-vocabulary word: skip it
                continue
        else:
            break
    return model.predict(x)
def clean_text_by_word(text, deacc=True):
    """Tokenize a given text into words, applying filters and lemmatize them.

    Parameters
    ----------
    text : str
        Given text.
    deacc : bool, optional
        Remove accentuation if True.

    Returns
    -------
    dict
        Words as keys, :class:`~gensim.summarization.syntactic_unit.SyntacticUnit` as values.

    Example
    -------
    .. sourcecode:: pycon

        >>> from gensim.summarization.textcleaner import clean_text_by_word
        >>> clean_text_by_word("God helps those who help themselves")
        {'god': Original unit: 'god' *-*-*-* Processed unit: 'god',
        'help': Original unit: 'help' *-*-*-* Processed unit: 'help',
        'helps': Original unit: 'helps' *-*-*-* Processed unit: 'help'}

    """
    text_without_acronyms = replace_with_separator(text, "", [AB_ACRONYM_LETTERS])
    original_words = list(tokenize(text_without_acronyms, to_lower=True, deacc=deacc))
    filtered_words = [join_words(word_list, "")
                      for word_list in preprocess_documents(original_words)]
    if HAS_PATTERN:
        tags = tag(join_words(original_words))  # tag needs the context of the words in the text
    else:
        tags = None
    units = merge_syntactic_units(original_words, filtered_words, tags)
    return {unit.text: unit for unit in units}
def _extract_texts(self, content):
    # remove all \n
    # gensim's tokenizer and to lowercase
    # remove stop words
    # remove infrequent words (not implemented below)
    try:
        title = content.get("title", "")
        comments = content.get("comments", [])
        comments = map(lambda x: x.replace("\n", " "), comments)
        raw_texts = "{} {}".format(title, "\n".join(comments))
        # tokenize
        # from nltk.tokenize import RegexpTokenizer
        # tokenizer = RegexpTokenizer('[a-zA-Z][a-zA-Z0-9]*')
        tokens = list(tokenize(raw_texts, lower=True, deacc=True))
        # stopwords
        clean_tokens = [t for t in tokens if t not in en_stopwords]
        texts = " ".join(clean_tokens)
        return texts
    except Exception:
        traceback.print_exc()